In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


import other libraries

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import cohen_kappa_score, make_scorer, classification_report, accuracy_score
import re
from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from gensim.models import Word2Vec

Read in the dataset

In [3]:
df_train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
df_test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
df_sub = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')

Data cleaning

In [4]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"\xa0"," ",phrase)
    return phrase.strip()

def clean_text(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    x = re.sub("'\d+", '',x)
    x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return decontracted(x)

In [5]:
df_train['full_text'] = df_train['full_text'].apply(clean_text)
df_test['full_text'] = df_test['full_text'].apply(clean_text)


In [6]:
df_train['full_text'][0]

'many people have car where they live. the thing they do not know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in vauban,germany they dont have that proble because percent of vauban is families do not own cars,and percent sold a car to move there. street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the french and swiss borders. you probaly will not see a car in vauban is streets because they are completely "car free" but if some that lives in vauban that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $, along with a home. the vauban people completed this in ,they said that this an example of a growing trend in europe,the untile states and some where else are suburba

feature extraction - maybe add in more features like bag of words/ keywords?

create a word_count and text length column

In [7]:
df_train['word_count'] = df_train["full_text"].str.split().str.len()
df_test['word_count'] = df_test["full_text"].str.split().str.len()

In [8]:
df_train['length'] = df_train["full_text"].str.len()
df_test['length'] = df_test["full_text"].str.len()

In [9]:
df_train.head()

Unnamed: 0,essay_id,full_text,score,word_count,length
0,000d118,many people have car where they live. the thin...,3,496,2647
1,000fe60,i am a scientist at nasa that is discussing th...,3,336,1668
2,001ab80,people always wish they had the same technolog...,4,553,3068
3,001bdc0,"we all heard about venus, the planet without a...",4,450,2679
4,002ba53,"dear, state senator this is a letter to argue ...",3,377,2188


attempt at creating vector embeddings:

In [10]:
word2vec_model = Word2Vec(df_train['full_text'], 
                          vector_size=100,
                          min_count=1,
                          window=5, 
                          )

In [11]:
def vectorize_text(text, word2vec_model):
    text_vector = np.zeros(100)  # Initialize a vector of zeros (100-dimensional)
    for word in text.split():  # Split text into words
        if word in word2vec_model.wv:  # Check if word is in the vocabulary
            text_vector += word2vec_model.wv[word]  # Add word vector to the text vector
    return text_vector

In [12]:
# Apply the vectorize_text function to each text entry in train_df['full_text']
df_train['text_vector'] = df_train['full_text'].apply(lambda x: vectorize_text(x, word2vec_model))
df_test['text_vector'] = df_test['full_text'].apply(lambda x: vectorize_text(x, word2vec_model))


In [13]:
df_train.head()

Unnamed: 0,essay_id,full_text,score,word_count,length,text_vector
0,000d118,many people have car where they live. the thin...,3,496,2647,"[-0.6929380334913731, 2.866185742430389, -3.85..."
1,000fe60,i am a scientist at nasa that is discussing th...,3,336,1668,"[-0.33317258208990097, 2.139420425519347, -4.4..."
2,001ab80,people always wish they had the same technolog...,4,553,3068,"[-0.026480753906071186, 1.453783854842186, -2...."
3,001bdc0,"we all heard about venus, the planet without a...",4,450,2679,"[-0.0007969895377755165, 3.742163436487317, -3..."
4,002ba53,"dear, state senator this is a letter to argue ...",3,377,2188,"[-0.29576310981065035, -0.024010476656258106, ..."


In [14]:
# X_train = df_train['full_text']
# y_train = df_train['score']
# X_test = df_test['full_text']
# y_test = df_sub['score']

this method trains and evaluates each pipeline 
returns the y_pred and quadratic weighted kappa score of the model

In [15]:
def train_and_evaluate_pipeline(pipeline, X_train, y_train, X_test, y_test):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    
    print()
    print('Classifier used: ', pipeline.named_steps['classifier'])
    print('y_test: ', y_test.tolist()[0:10])
    print('y_pred: ', y_pred.tolist()[0:10])
    print('quadratic weighted kappa score: ', kappa)
    
    return pipeline, y_pred

method for performing cross_validation of pipelines and showing their average performances

In [16]:
#define quadratic weighted kappa as a custom scorer
qwk_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

def pipeline_cross_validation(pipeline, X_train, y_train, qwk_scorer=qwk_scorer):
    # Define cross-validation strategy and perform cross-validation
    scoring = {'accuracy': 'accuracy', 'f1_weighted': 'f1_weighted', 'qwk': qwk_scorer}
    cv_results = cross_validate(pipeline, X_train, y_train, cv=5, scoring=scoring)

    # Print cross-validation results
    print("Mean Accuracy:", np.mean(cv_results['test_accuracy']))
    print("Mean Weighted F1-score:", np.mean(cv_results['test_f1_weighted']))
    print("Mean Quadratic Weighted Kappa Score:", np.mean(cv_results['test_qwk']))

create the tfidf

In [17]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, max_df=0.95,)
# tfidf = tfidf_vectorizer.fit_transform(X_train)


create the multinomial logistic regression classifier

In [18]:
logreg_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)

create a pipeline for logistic regression classifier

In [19]:
# Define a transformer to extract 'text_vector' from DataFrame
def get_text_vector(df):
    return np.vstack(df['text_vector'])

In [20]:
tfidf_preprocessor = ColumnTransformer(
    transformers=[
        ('text', tfidf_vectorizer, 'full_text'),
        ('word_count', StandardScaler(), ['word_count']),
        ('length', StandardScaler(), ['length'])
    ],
    remainder='passthrough'
)

w2v_preprocessor = ColumnTransformer(
    transformers=[
#         ('text', tfidf_vectorizer, 'full_text'),
        ('text_vector', FunctionTransformer(get_text_vector, validate=False), ['text_vector']),
        ('word_count', StandardScaler(), ['word_count']),
        ('length', StandardScaler(), ['length'])
    ],
    remainder='passthrough'
)

In [21]:
logreg_pipeline = Pipeline(steps=[
#     ('tfidf', tfidf_vectorizer),
    ('preprocessor', tfidf_preprocessor),
    ('classifier', logreg_clf)
])

w2v_logreg_pipeline = Pipeline(steps=[
    ('preprocessor', w2v_preprocessor),
    ('classifier', logreg_clf)
])

fit and make a prediction

In [22]:
# df_sub['essay_id'].tolist()
# df_train.loc[(df_train['essay_id'] == '000d118') | (df_train['essay_id'] =='000fe60') | (df_train['essay_id'] =='001ab80')]
# sub_df_data = df_train.loc[(df_train['essay_id'] == '000d118') | (df_train['essay_id'] =='000fe60') | (df_train['essay_id'] =='001ab80')]
# X_test3 = tfidf_vectorizer.transform(df_test['full_text'])

In [23]:
X = df_train[['full_text', 'word_count', 'length']]
# X = df_train['full_text']
y = df_train['score']
sub_X = df_test[['full_text', 'word_count', 'length']]
sub_y = df_sub['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

X2 = df_train[['word_count', 'length', 'text_vector']]
sub_X2 = df_test[['word_count', 'length', 'text_vector']]

# X2 = np.array(df_train['text_vector'].tolist())
# sub_X2 = np.array(df_test['text_vector'].tolist())

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, train_size = 0.8)


# trained_logreg_pipeline, logreg_y_pred = train_and_evaluate_pipeline(logreg_pipeline, X_train, y_train, X_test, y_test)

In [24]:
trained_logreg_pipeline, logreg_y_pred = train_and_evaluate_pipeline(logreg_pipeline, X_train, y_train, X_test, y_test)


Classifier used:  LogisticRegression(max_iter=500, multi_class='multinomial')
y_test:  [2, 2, 3, 4, 5, 3, 3, 2, 3, 3]
y_pred:  [2, 1, 3, 4, 4, 3, 2, 2, 4, 3]
quadratic weighted kappa score:  0.7210035214939079


commented off just to make the notebook save and rerun properly, need to use a different clf if plan to keep

In [25]:
# trained_logreg_pipeline2, logreg_y_pred2 = train_and_evaluate_pipeline(w2v_logreg_pipeline, X_train2, y_train2, X_test2, y_test2)

In [26]:
pipeline_cross_validation(logreg_pipeline, X_train, y_train)

Mean Accuracy: 0.5905381003972554
Mean Weighted F1-score: 0.5795778162668027
Mean Quadratic Weighted Kappa Score: 0.7229012629910739


In [27]:
print(classification_report(y_test, logreg_y_pred))

              precision    recall  f1-score   support

           1       0.61      0.25      0.36       272
           2       0.63      0.67      0.65       930
           3       0.56      0.70      0.62      1216
           4       0.61      0.53      0.56       831
           5       0.49      0.32      0.39       184
           6       0.25      0.07      0.11        29

    accuracy                           0.59      3462
   macro avg       0.53      0.42      0.45      3462
weighted avg       0.59      0.59      0.58      3462



In [28]:
print("Model prediction on submission essays: ", trained_logreg_pipeline.predict(sub_X).tolist())
print('Submission essays actual classes: ', sub_y.tolist())

Model prediction on submission essays:  [3, 3, 4]
Submission essays actual classes:  [3, 3, 4]


In [29]:
# logregcv_clf = LogisticRegressionCV(multi_class='multinomial', cv=5, solver='lbfgs', max_iter=1000, refit=True, scoring=qwk_scorer)

create a lightgbm model pipeline

lgbm over xgboost for performance and scalability? maybe elaborate more

In [30]:
lgbm_clf = LGBMClassifier(objective='multiclass', num_class=6, metric='multi_logloss', num_leaves=31, learning_rate=0.1, feature_fraction=0.6)

In [31]:
lgbm_pipeline = Pipeline(steps=[
#     ('tfidf', tfidf_vectorizer),   # TF-IDF Vectorizer
    ('preprocessor', tfidf_preprocessor),
    ('classifier', lgbm_clf) # LightGBM Classifier
])

use some GridSearchCV to find the best parameters

In [32]:
# parameter grid used for the grid search:
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'feature_fraction': [0.6, 0.8, 0.9]
}

In [33]:
# # create and fit the gridsearch
# grid_search = GridSearchCV(estimator=lgbm_clf, param_grid=param_grid, cv=3, scoring=qwk_scorer)
# grid_search.fit(tfidf_vectorizer.fit_transform(X_train), y_train)

# # Print the best parameters and best score
# print("Best Parameters: ", grid_search.best_params_)
# print("Best Score (quadratic weighted kappa score): ", grid_search.best_score_)

In [34]:
trained_lgbm_pipeline, lgbm_y_pred = train_and_evaluate_pipeline(lgbm_pipeline, X_train, y_train, X_test, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112517
[LightGBM] [Info] Number of data points in the train set: 13845, number of used features: 444
[LightGBM] [Info] Start training from score -2.648127
[LightGBM] [Info] Start training from score -1.294767
[LightGBM] [Info] Start training from score -1.005767
[LightGBM] [Info] Start training from score -1.498136
[LightGBM] [Info] Start training from score -2.868723
[LightGBM] [Info] Start training from score -4.691492

Classifier used:  LGBMClassifier(feature_fraction=0.6, metric='multi_logloss', num_class=6,
               objective='multiclass')
y_test:  [2, 2, 3, 4, 5, 3, 3, 2, 3, 3]
y_pred:  [2, 2, 3, 4, 4, 3, 2, 2, 4, 3]
quadratic weighted kappa score:  0.7380728425658367


commented off just to make the notebook run faster for now

In [35]:
# pipeline_cross_validation(lgbm_pipeline, X_train, y_train)

In [36]:
print(classification_report(y_test, lgbm_y_pred))

              precision    recall  f1-score   support

           1       0.60      0.31      0.41       272
           2       0.65      0.62      0.64       930
           3       0.58      0.67      0.62      1216
           4       0.60      0.64      0.62       831
           5       0.49      0.41      0.45       184
           6       0.25      0.07      0.11        29

    accuracy                           0.60      3462
   macro avg       0.53      0.45      0.47      3462
weighted avg       0.60      0.60      0.60      3462



In [37]:
# trained_lgbm_pipeline.predict(df_test[['full_text', 'word_count', 'length']]).tolist()
print("Model prediction on submission essays: ", trained_lgbm_pipeline.predict(sub_X).tolist())
print('Submission essays actual classes: ', sub_y.tolist())

Model prediction on submission essays:  [3, 3, 4]
Submission essays actual classes:  [3, 3, 4]


LogisticRegressionCV classifier model and pipeline

In [38]:
# logregcv_clf = LogisticRegressionCV(multi_class='multinomial', cv=3, solver='lbfgs', max_iter=1000, refit=True, scoring=qwk_scorer)

In [39]:
# logregcv_pipeline = Pipeline(steps=[
# #     ('tfidf', tfidf_vectorizer),
#     ('preprocessor', preprocessor),
#     ('classifier', logregcv_clf)
# ])

In [40]:
# logregcv_y_pred = train_and_evaluate_pipeline(logregcv_pipeline, X_train, y_train, X_test, y_test)

# ***Things to do:***

- put explanations and more markup in notebook

- more EDA, some diagrams and graphs on the dataset 

    ideas: - compare the cross validation qwk scores, group by scores and show word lengths etc

- try word2vec for vector embeddings and compare with tfidf - make w2v more accurate somehow

- find more models - potential: xgboostclassifier, ordinal logistic regression, ordinal random forest,

- overall increase the performance of the models they arent predicting the test dataset 100% correctly

