<a href="https://www.kaggle.com/code/lorenzojayd/aes-2-spell-check?scriptVersionId=176697763" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Dependencies

## Libraries

In [1]:
!pip install '/kaggle/input/pyspellchecker-0-8-1/pyspellchecker-0.8.1-py3-none-any.whl'

Processing /kaggle/input/pyspellchecker-0-8-1/pyspellchecker-0.8.1-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [2]:
import numpy as np
import pandas as pd
import re

import nltk
from spellchecker import SpellChecker

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, cohen_kappa_score, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier

## Data

In [3]:
data_train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
data_test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
sample_submission = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')

# A quick look at the data

In [4]:
# Train data summary
print('Train data')
display(data_train.head(2))
display(data_train.info())
print('-' * 75)

# Test data summary
print('\nTest data')
display(data_test.head())
display(data_test.info())
print('-' * 75)

# Sample submission file
print('\nSample submission file')
display(sample_submission.head(2))

Train data


Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   score      17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB


None

---------------------------------------------------------------------------

Test data


Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   3 non-null      object
 1   full_text  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


None

---------------------------------------------------------------------------

Sample submission file


Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3


# Data Wrangling

## Preprocessing
Partially inspired by: https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-cv-0-799-lb-0-799

In [5]:
def removeHTML(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def preprocess(text):
    # Transform all characters into lowercase
    text = text.lower()
    # Remove HTML
    text = removeHTML(text)
    # Remove numbers
    text = re.sub("'\d+|\d+", '', text)
    # Replace repeating spaces and punctuation with one character
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\.+", ".", text)
    text = re.sub(r"\,", ",", text)
    # Remove URL strings
    text = re.sub("http\w+", '', text)
    # Replace non-breaking space with space
    text = re.sub(u'\xa0', u' ', text)
    # Remove leading and trailing whitespaces after other transformations
    text = text.strip()
    return text

In [6]:
# Preprocess data
df_train = data_train.copy()
df_train['full_text'] = data_train['full_text'].apply(lambda text: preprocess(text))

df_test = data_test.copy()
df_test['full_text'] = data_test['full_text'].apply(lambda text: preprocess(text))

# Label encoding the target feature
enc = LabelEncoder()
df_train['score_encoded'] = enc.fit_transform(df_train['score'])

In [7]:
# Compare original train data with the new train dataframe
display(data_train.head(2))
display(df_train.head(2))

display(data_train['score'].value_counts())
display(df_train['score_encoded'].value_counts())

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3


Unnamed: 0,essay_id,full_text,score,score_encoded
0,000d118,many people have car where they live. the thin...,3,2
1,000fe60,i am a scientist at nasa that is discussing th...,3,2


score
3    6280
2    4723
4    3926
1    1252
5     970
6     156
Name: count, dtype: int64

score_encoded
2    6280
1    4723
3    3926
0    1252
4     970
5     156
Name: count, dtype: int64

## Feature Engineering

In [8]:
def feature_engineer(df):
    # Initialize spell checker
    spell = SpellChecker()
    
    # Separating the original full text into paragraphs
    df['paragraph'] = df['full_text'].apply(lambda text: text.split('\n\n'))
    # Separating the original full text into sentences
    df['sentence'] = df['full_text'].apply(lambda text: nltk.tokenize.sent_tokenize(text))
    # Obtain tokens
    df['word'] = df['full_text'].apply(lambda text: nltk.tokenize.word_tokenize(text))
    
    # Counts the number of paragraphs
    df['paragraph_count'] = df['paragraph'].apply(lambda x: len(x))
    # Counts the number of sentences
    df['sentence_count'] = df['sentence'].apply(lambda x: len(x))
    # Counts the number of words
    df['word_count'] = df['word'].apply(lambda x: len(x))
    # Counts the number of characters
    df['character_count'] = df['full_text'].apply(lambda x: len(x))
    # Counts the misspelled words
    df['misspelled_count'] = df['word'].apply(lambda x: len(spell.unknown(x)))
    
    return df.drop(columns = ['paragraph', 'sentence', 'word'])


In [9]:
## Feature engineering on the train and test data
df_train = feature_engineer(df_train)
df_test = feature_engineer(df_test)

## TF-IDF Vectorization

In [10]:
# Initialize TF-IDF Vectorizer
vec = TfidfVectorizer(strip_accents = 'unicode',
                      analyzer = 'word',
                      min_df = 0.05,
                      max_df = 0.95,
                      sublinear_tf = True,
                      max_features = None,
                     )

# Train data
tfidf_train = vec.fit_transform([i for i in df_train['full_text']])
tfidf_train_arr = tfidf_train.toarray()
df_tfidf_train = pd.DataFrame(tfidf_train_arr)

# Test data
tfidf_test = vec.transform([i for i in df_test['full_text']])
tfidf_test_arr = tfidf_test.toarray()
df_tfidf_test = pd.DataFrame(tfidf_test_arr)

# Combining all of the new features
df_train = pd.concat([df_train, df_tfidf_train], axis = 1)
df_test = pd.concat([df_test, df_tfidf_test], axis = 1)

In [11]:
display(df_train.head(3))
display(df_train.info())
print('-' * 150)
display(df_test.head())
display(df_test.info())

Unnamed: 0,essay_id,full_text,score,score_encoded,paragraph_count,sentence_count,word_count,character_count,misspelled_count,0,...,614,615,616,617,618,619,620,621,622,623
0,000d118,many people have car where they live. the thin...,3,2,1,13,539,2640,27,0.0,...,0.0,0.0,0.0,0.0,0.084513,0.068437,0.0,0.0,0.099545,0.050971
1,000fe60,i am a scientist at nasa that is discussing th...,3,2,1,21,371,1663,12,0.0,...,0.0,0.078426,0.0,0.0,0.0,0.0,0.0,0.0,0.105819,0.0
2,001ab80,people always wish they had the same technolog...,4,3,1,24,605,3065,12,0.050198,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031816,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Columns: 633 entries, essay_id to 623
dtypes: float64(624), int64(7), object(2)
memory usage: 83.6+ MB


None

------------------------------------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,essay_id,full_text,paragraph_count,sentence_count,word_count,character_count,misspelled_count,0,1,2,...,614,615,616,617,618,619,620,621,622,623
0,000d118,many people have car where they live. the thin...,1,13,539,2640,27,0.0,0.039362,0.0,...,0.0,0.0,0.0,0.0,0.084513,0.068437,0.0,0.0,0.099545,0.050971
1,000fe60,i am a scientist at nasa that is discussing th...,1,21,371,1663,12,0.0,0.075796,0.0,...,0.0,0.078426,0.0,0.0,0.0,0.0,0.0,0.0,0.105819,0.0
2,001ab80,people always wish they had the same technolog...,1,24,605,3065,12,0.050198,0.059467,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031816,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Columns: 631 entries, essay_id to 623
dtypes: float64(624), int64(5), object(2)
memory usage: 14.9+ KB


None

In [12]:
# Separating train data into independent features (X) and target feature (y)
X = df_train.drop(columns = ['essay_id', 'full_text', 'score', 'score_encoded'])
y = df_train['score_encoded']

# Splitting train data into training and validation splits
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Obtaining X features from test data
X_test = df_test.drop(columns = ['essay_id', 'full_text'])

# Model Training

In [13]:
# Making the cohen kappa scorer
kappa_scorer = make_scorer(cohen_kappa_score)

# Random search parameters
params = {
    'n_estimators': [10, 100, 300],
    'max_depth': [3, 5, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5],
}

clf = RandomizedSearchCV(XGBClassifier(), param_distributions = params, cv = 3, scoring = kappa_scorer, verbose = 2)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ....learning_rate=0.2, max_depth=5, n_estimators=10; total time=   5.8s
[CV] END ....learning_rate=0.2, max_depth=5, n_estimators=10; total time=   5.8s
[CV] END ....learning_rate=0.2, max_depth=5, n_estimators=10; total time=   5.7s
[CV] END ...learning_rate=0.5, max_depth=3, n_estimators=100; total time=  25.1s
[CV] END ...learning_rate=0.5, max_depth=3, n_estimators=100; total time=  24.8s
[CV] END ...learning_rate=0.5, max_depth=3, n_estimators=100; total time=  22.6s
[CV] END ....learning_rate=0.5, max_depth=5, n_estimators=10; total time=   6.6s
[CV] END ....learning_rate=0.5, max_depth=5, n_estimators=10; total time=   5.5s
[CV] END ....learning_rate=0.5, max_depth=5, n_estimators=10; total time=   5.4s
[CV] END ..learning_rate=0.35, max_depth=8, n_estimators=300; total time= 2.9min
[CV] END ..learning_rate=0.35, max_depth=8, n_estimators=300; total time= 2.9min
[CV] END ..learning_rate=0.35, max_depth=8, n_es

In [14]:
print('Best parameters:', clf.best_params_)
print('Best score:', clf.best_score_)

Best parameters: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.15}
Best score: 0.44857127172442773


# Model Evaluation

In [15]:
y_preds = clf.predict(X_val)

print('Accuracy Score: ', accuracy_score(y_val, y_preds))
print('Cohen Kappa Score: ', cohen_kappa_score(y_val, y_preds))

Accuracy Score:  0.6143847487001733
Cohen Kappa Score:  0.46490359333613196


# Predicting

In [16]:
# Predicting
predictions_encoded = clf.predict(X_test)

# Inverse encoding labels
predictions = enc.inverse_transform(predictions_encoded)

# Submission

In [17]:
# Building submission dataframe
submission = pd.DataFrame({
    'essay_id': df_test['essay_id'],
    'score': predictions,
})

display(submission)

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,4


In [18]:
# Export submission file
submission.to_csv('submission.csv', index = None)