In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv


import other libraries

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
import re
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

Read in the dataset

In [3]:
df_train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
df_test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
df_sub = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')

Data cleaning

In [4]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"\xa0"," ",phrase)
    return phrase.strip()

def clean_text(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    x = re.sub("'\d+", '',x)
    x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return decontracted(x)

In [5]:
df_train['full_text'] = df_train['full_text'].apply(clean_text)
df_test['full_text'] = df_test['full_text'].apply(clean_text)


In [6]:
df_train['full_text'][0]

'many people have car where they live. the thing they do not know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in vauban,germany they dont have that proble because percent of vauban is families do not own cars,and percent sold a car to move there. street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the french and swiss borders. you probaly will not see a car in vauban is streets because they are completely "car free" but if some that lives in vauban that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $, along with a home. the vauban people completed this in ,they said that this an example of a growing trend in europe,the untile states and some where else are suburba

feature extraction - maybe add in more features like bag of words/ keywords?

create a word_count column

In [7]:
df_train['word_count'] = df_train["full_text"].str.split().str.len()
df_test['word_count'] = df_test["full_text"].str.split().str.len()

In [8]:
df_train.head()

Unnamed: 0,essay_id,full_text,score,word_count
0,000d118,many people have car where they live. the thin...,3,496
1,000fe60,i am a scientist at nasa that is discussing th...,3,336
2,001ab80,people always wish they had the same technolog...,4,553
3,001bdc0,"we all heard about venus, the planet without a...",4,450
4,002ba53,"dear, state senator this is a letter to argue ...",3,377


X and y's, might need to edit?

possible solution i saw is to combine the train and test dfs and then split them again afterwards?

In [9]:
X_train = df_train['full_text']
y_train = df_train['score']
X_test = df_test['full_text']
# y_test = pd.DataFrame()
y_test = df_sub['score']

# X_train = tfidf_vectorizer.fit_transform(df_train['full_text'])
# y_train = df_train['score']
# X_test = tfidf_vectorizer.transform(df_test['full_text'])
# y_test = df_test['target_column']

this method trains and evaluates each pipeline 
returns the y_pred and quadratic weighted kappa score of the model

In [10]:
def train_and_evaluate_pipeline(pipeline, X_train, y_train, X_test, y_test):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    
    print()
    print('Classifier used: ', pipeline.named_steps['classifier'])
    print('y_test: ', y_test.tolist())
    print('y_pred: ', y_pred.tolist())
    print('quadratic weighted kappa score: ', kappa)
    
    return y_pred

create the tfidf

In [11]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, max_df=0.95,)
# tfidf = tfidf_vectorizer.fit_transform(X_train)


create the multinomial logistic regression classifier

In [12]:
logreg_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)

create a pipeline for logistic regression classifier

In [13]:
logreg_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('classifier', logreg_clf)
])

fit and make a prediction

In [14]:
# logreg_pipeline.fit(X_train, y_train)
# y_pred = logreg_pipeline.predict(X_test)

logreg_y_pred = train_and_evaluate_pipeline(logreg_pipeline, X_train, y_train, X_test, y_test)


Classifier used:  LogisticRegression(max_iter=500, multi_class='multinomial')
y_test:  [3, 3, 4]
y_pred:  [1, 3, 4]
quadratic weighted kappa score:  0.6666666666666667


In [15]:
# logregcv_clf = LogisticRegressionCV(multi_class='multinomial', cv=5, solver='lbfgs', max_iter=1000, refit=True, scoring=qwk_scorer)

In [16]:
# y_pred.tolist()

In [17]:
# kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
# kappa


In [18]:
# y_test.tolist()

create a lightgbm model pipeline

lgbm over xgboost for performance and scalability? maybe elaborate more

In [19]:
# params = {
#     'objective': 'multiclass',
#     'num_class': 6,
#     'metric': 'multi_logloss',
#     'num_leaves': 31,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
# }
lgbm_clf = LGBMClassifier(objective='multiclass', num_class=6, metric='multi_logloss', num_leaves=31, learning_rate=0.1, feature_fraction=0.6)

# lgbm_clf = LGBMClassifier(params=params)

In [20]:
lgbm_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),   # TF-IDF Vectorizer
    ('classifier', lgbm_clf) # LightGBM Classifier
])

use some GridSearchCV to find the best parameters

In [21]:
#define quadratic weighted kappa as a custom scorer
qwk_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# parameter grid used for the grid search:
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'feature_fraction': [0.6, 0.8, 0.9]
}

In [22]:
# # create and fit the gridsearch
# grid_search = GridSearchCV(estimator=lgbm_clf, param_grid=param_grid, cv=3, scoring=qwk_scorer)
# grid_search.fit(tfidf_vectorizer.fit_transform(X_train), y_train)

# # Print the best parameters and best score
# print("Best Parameters: ", grid_search.best_params_)
# print("Best Score (quadratic weighted kappa score): ", grid_search.best_score_)

In [23]:
lgbm_y_pred = train_and_evaluate_pipeline(lgbm_pipeline, X_train, y_train, X_test, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075276 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 113475
[LightGBM] [Info] Number of data points in the train set: 17307, number of used features: 445
[LightGBM] [Info] Start training from score -2.626369
[LightGBM] [Info] Start training from score -1.298667
[LightGBM] [Info] Start training from score -1.013741
[LightGBM] [Info] Start training from score -1.483490
[LightGBM] [Info] Start training from score -2.881570
[LightGBM] [Info] Start training from score -4.709010

Classifier used:  LGBMClassifier(feature_fraction=0.6, metric='multi_logloss', num_class=6,
               objective='multiclass')
y_test:  [3, 3, 4]
y_pred:  [3, 2, 4]
quadratic weighted kappa score:  0.6666666666666667


In [24]:
print(classification_report(y_test, lgbm_y_pred))

              precision    recall  f1-score   support

           2       0.00      0.00      0.00         0
           3       1.00      0.50      0.67         2
           4       1.00      1.00      1.00         1

    accuracy                           0.67         3
   macro avg       0.67      0.50      0.56         3
weighted avg       1.00      0.67      0.78         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegressionCV classifier model and pipeline

In [25]:
logregcv_clf = LogisticRegressionCV(multi_class='multinomial', cv=3, solver='lbfgs', max_iter=1000, refit=True, scoring=qwk_scorer)

In [26]:
logregcv_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('classifier', logregcv_clf)
])

In [27]:
logregcv_y_pred = train_and_evaluate_pipeline(logregcv_pipeline, X_train, y_train, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Classifier used:  LogisticRegressionCV(cv=3, max_iter=1000, multi_class='multinomial',
                     scoring=make_scorer(cohen_kappa_score, weights=quadratic))
y_test:  [3, 3, 4]
y_pred:  [1, 3, 4]
quadratic weighted kappa score:  0.6666666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# ***Things to do:***

1- split the training dataset, that makes more sense lol i misinterpreted the dataset in the first place, probably should just use tfidf vectorizer on X and use that.... probably dont need the pipelines then

2 - more EDA, some diagrams and graphs on the dataset

3 - try word2vec for vector embeddings and compare with tfidf

4 - find more models - potential: xgboostclassifier, ordinal logistic regression, ordinal random forest,

5 - overall increase the performance of the models they arent predicting the test dataset 100% correctly

