# M - Automated Essay Scoring
_School of Information Technology_<br>
_Monash University Malaysia_<br>
(c) Copyright 2020, Ian Tan & Jun Qing Lim

Steps

- Import libraries
- Read dataset (ASAP)
- Extract features (into file) using EASE
- Conduct machine learning (Sci-kit Learn libraries)
- Evaluate (QWK)

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm #SVR is in SVM
from sklearn.metrics import accuracy_score, confusion_matrix

#### Import the EASE functions, which is located in the ease folder.

In [2]:
import sys
sys.path.insert(1, 'ease')
import create
import grade 
import model_creator 
import predictor_extractor 
import predictor_set 
import util_functions
import essay_set
import feature_extractor

from essay_set import EssaySet
from feature_extractor import FeatureExtractor

## Read Dataset

AES (Hewlett Foundation dataset from Kaggle) in the folder "aes"

In [3]:
train_set = pd.read_csv("asap-aes/training_set_rel3.tsv", sep='\t', encoding="latin-1")
#train_set.head()

In [4]:
# Am filtering just for this current exercise.
# Set 2 has 1,800 essays, sufficient for current work
train_set = train_set[train_set['essay_set'] == 2]  # filter for set 2
#train_set.head()

In [5]:
train_set = train_set.reset_index() # resets index
#train_set.head()

In [6]:
train_set['essay'] = [entry.lower() for entry in train_set['essay']] # lower case for all words in essay
#train_set.head()

In [7]:
essays = train_set['essay']
scores = train_set['domain1_score']

In [8]:
scores.columns = "score"

#### Create the essay sets

In [9]:
# This can take some time, be patient :-)
e_set = EssaySet()

for i in range(len(essays)):
    e_set.add_essay(essays[i], scores[i])

## Extract Features

In [10]:
f_extractor = FeatureExtractor()

In [11]:
length = f_extractor.gen_length_feats(e_set)
length_df = pd.DataFrame(
    length, 
    columns = [
        'chars', 
        'words', 
        'commas', 
        'apostrophes', 
        'punctuations', 
        'avg_word_length', 
        'POS', 
        'POS/total_words'
    ]
)

#### Collate the essay prompts
This consist of one essay from each set

In [12]:
essay_prompts = []

# Takes a bit of time also :)
for i in range(1,9):
    file = "prompts/set" + str(i) + ".txt"
    f = open(file, "r", encoding="latin-1") # there are some 0x9x characters, hence need to specify encoding
    essay_prompts.append(f.read())
    
def get_essay_prompt(essay_set):
    return essay_prompts[essay_set-1]

In [13]:
# Unsure how this works
e_set.update_prompt(get_essay_prompt(2))

# Need more explanation on how this works - look into EASE

prompts = f_extractor.gen_prompt_feats(e_set)
prompts_df = pd.DataFrame(prompts, columns = ['prompt_words', 'prompt_words/total_words', 'synonym_words', 'synonym_words/total_words'])

In [14]:
e_set

<essay_set.EssaySet at 0x1efd7badc50>

In [15]:
# Another process that takes sometime to process
unstemmed = util_functions.get_vocab_essays_count(e_set._text, e_set._score)
stemmed = util_functions.get_vocab_essays_count(e_set._clean_stem_text, e_set._score)

bow = list(map(lambda a,b:[a,b], unstemmed, stemmed))
bow_df = pd.DataFrame(bow, columns = ['unstemmed', 'stemmed'])

In [16]:
features = pd.concat([length_df, prompts_df, bow_df], axis=1, sort=False)

In [17]:
features.head()

Unnamed: 0,chars,words,commas,apostrophes,punctuations,avg_word_length,POS,POS/total_words,prompt_words,prompt_words/total_words,synonym_words,synonym_words/total_words,unstemmed,stemmed
0,2639.0,527.0,15.0,13.0,21.0,5.00759,524.330784,0.994935,220.0,0.417457,112.0,0.212524,584,559
1,841.0,180.0,5.0,2.0,3.0,4.672222,178.6629,0.992572,82.0,0.455556,66.0,0.366667,210,210
2,1181.0,261.0,12.0,15.0,14.0,4.524904,257.992218,0.988476,144.0,0.551724,83.0,0.318008,291,285
3,2705.0,527.0,22.0,6.0,31.0,5.132827,521.65392,0.989856,245.0,0.464896,131.0,0.248577,547,528
4,2394.0,501.0,25.0,15.0,34.0,4.778443,484.298031,0.966663,216.0,0.431138,117.0,0.233533,591,562


In [18]:
# Export features to a file for next stage (optional)
dataset = features.merge(scores, left_index=True, right_index=True)

In [19]:
dataset.head()

Unnamed: 0,chars,words,commas,apostrophes,punctuations,avg_word_length,POS,POS/total_words,prompt_words,prompt_words/total_words,synonym_words,synonym_words/total_words,unstemmed,stemmed,domain1_score
0,2639.0,527.0,15.0,13.0,21.0,5.00759,524.330784,0.994935,220.0,0.417457,112.0,0.212524,584,559,4
1,841.0,180.0,5.0,2.0,3.0,4.672222,178.6629,0.992572,82.0,0.455556,66.0,0.366667,210,210,1
2,1181.0,261.0,12.0,15.0,14.0,4.524904,257.992218,0.988476,144.0,0.551724,83.0,0.318008,291,285,2
3,2705.0,527.0,22.0,6.0,31.0,5.132827,521.65392,0.989856,245.0,0.464896,131.0,0.248577,547,528,4
4,2394.0,501.0,25.0,15.0,34.0,4.778443,484.298031,0.966663,216.0,0.431138,117.0,0.233533,591,562,4


In [20]:
dataset.columns = ['chars', 'words', 'commas', 'apostrophes', 'punctuations',
       'avg_word_length', 'POS', 'POS/total_words', 'prompt_words',
       'prompt_words/total_words', 'synonym_words',
       'synonym_words/total_words', 'unstemmed', 'stemmed', 'score']

In [21]:
dataset.head()

Unnamed: 0,chars,words,commas,apostrophes,punctuations,avg_word_length,POS,POS/total_words,prompt_words,prompt_words/total_words,synonym_words,synonym_words/total_words,unstemmed,stemmed,score
0,2639.0,527.0,15.0,13.0,21.0,5.00759,524.330784,0.994935,220.0,0.417457,112.0,0.212524,584,559,4
1,841.0,180.0,5.0,2.0,3.0,4.672222,178.6629,0.992572,82.0,0.455556,66.0,0.366667,210,210,1
2,1181.0,261.0,12.0,15.0,14.0,4.524904,257.992218,0.988476,144.0,0.551724,83.0,0.318008,291,285,2
3,2705.0,527.0,22.0,6.0,31.0,5.132827,521.65392,0.989856,245.0,0.464896,131.0,0.248577,547,528,4
4,2394.0,501.0,25.0,15.0,34.0,4.778443,484.298031,0.966663,216.0,0.431138,117.0,0.233533,591,562,4


In [22]:
dataset.to_csv('maes_features.csv')

# End

Can just use the features and score for the X and y but just to keep to certain convention if reading back from the CSV file above.


In [23]:
X = dataset.iloc[:,0:13].values.astype(float)
y = dataset.iloc[:,13:14].values.astype(float)

In [24]:
X

array([[2.63900000e+03, 5.27000000e+02, 1.50000000e+01, ...,
        1.12000000e+02, 2.12523719e-01, 5.84000000e+02],
       [8.41000000e+02, 1.80000000e+02, 5.00000000e+00, ...,
        6.60000000e+01, 3.66666667e-01, 2.10000000e+02],
       [1.18100000e+03, 2.61000000e+02, 1.20000000e+01, ...,
        8.30000000e+01, 3.18007663e-01, 2.91000000e+02],
       ...,
       [6.05000000e+02, 1.32000000e+02, 7.00000000e+00, ...,
        3.40000000e+01, 2.57575758e-01, 1.64000000e+02],
       [2.73700000e+03, 5.61000000e+02, 2.20000000e+01, ...,
        1.68000000e+02, 2.99465241e-01, 5.42000000e+02],
       [2.46200000e+03, 5.20000000e+02, 8.00000000e+00, ...,
        1.65000000e+02, 3.17307692e-01, 5.14000000e+02]])

In [25]:
X.shape

(1800, 13)

In [26]:
y.shape

(1800, 1)

#### Conduct Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

In [28]:
len(X)

1800

In [29]:
len(y)

1800

#### Split the train and test sets

In [30]:
# To split the train / test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Have a look at the first few lines
print(X_train[:5, :])

[[ -0.79591812  -0.79274203  -0.52192486   2.266733    -0.57079758
   -0.17080642  -0.7857436    0.44590775  -0.52651768   1.35976342
   -0.4291183    1.3780107   -0.99534758]
 [ -0.51618611  -0.55699135  -0.43044826  -1.01271031  -0.17185984
    0.42231766  -0.56025533  -0.27271162  -0.55065849  -0.04485538
   -0.4291183    0.36655217  -0.75801391]
 [ -0.63721302   1.55901482  -1.25373772  -1.01271031  35.44758104
  -11.9319879    1.5781679    1.16155342   5.67766946   9.60384922
   -2.02112129  -6.02218295  -2.41310399]
 [  0.84707926   0.73676243  -0.06454183   1.44687217   0.39805121
    0.9216657    0.74323966   0.6514437    0.6081002   -0.31229061
    0.42293964  -0.7535938    1.10942735]
 [  0.63242776   0.60451204   0.48431781   1.61084433   0.284069
    0.34351803   0.61987387   1.05746662   0.51153697  -0.22908215
    0.28840417  -0.78537985   0.77216371]]


#### Conduct the fit

In [37]:
from sklearn.svm import SVR
# most important SVR parameter is Kernel type. It can be #linear,polynomial or gaussian SVR. We have a non-linear condition #so we can select polynomial or gaussian but here we select RBF(a #gaussian type) kernel.
# kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’
# maybe use poly and increase the degree
regressor = SVR(kernel='rbf')
regressor.fit(X_train,y_train.ravel())



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

#### Test / Predict the fit

In [38]:
# Not used yet as I don't have a sample X
y_pred = regressor.predict(X_test)

#### Accuracy Score

In [54]:
y_pred

array([ 1.51140762e+00,  1.45870685e+00, -1.02579765e+00,  1.12331094e-01,
        1.68596403e+00,  2.73007368e-01,  2.92951695e-01,  3.69565214e-01,
        6.33960903e-01, -6.31943970e-01,  1.65670525e+00, -5.62866375e-02,
       -4.51494973e-01, -8.62604289e-01, -6.55548912e-01,  1.35538061e+00,
       -1.29491934e+00, -8.78474514e-01, -1.32786533e+00, -6.55394810e-02,
        1.62825663e+00,  1.82544289e+00, -5.48874425e-01, -2.25411409e-01,
       -6.45051206e-01, -8.01076265e-01, -3.86322583e-01, -5.20883485e-01,
        4.99788938e-01, -3.53311427e-01,  3.25372250e-02,  1.73194476e+00,
       -9.90946147e-01,  1.70545785e+00, -1.50069616e+00,  2.09784566e-01,
        1.65852066e+00, -1.85077616e+00, -1.38340981e+00,  6.09976684e-01,
       -2.24687603e-01,  2.06865852e-01,  1.22449154e-01, -1.84309617e+00,
       -1.74020344e-01, -6.69754550e-02, -6.32550070e-01, -1.71071657e+00,
        5.05213128e-01, -4.33899989e-01, -2.29218725e-01,  1.02722110e+00,
        5.36459136e-01,  

In [55]:
y_test

array([[ 1.87263841e+00],
       [ 1.43907530e+00],
       [-9.20018082e-01],
       [ 2.14897113e-01],
       [ 1.87263841e+00],
       [ 2.21273041e-01],
       [ 3.04160106e-01],
       [ 4.06174955e-01],
       [ 6.22956509e-01],
       [-4.92830902e-01],
       [ 1.51558644e+00],
       [-1.29403002e-01],
       [-4.48199405e-01],
       [-8.37131017e-01],
       [-7.92499520e-01],
       [ 1.29242896e+00],
       [-1.28344598e+00],
       [-8.88138441e-01],
       [-1.27707005e+00],
       [-9.75233619e-02],
       [ 1.47733087e+00],
       [ 1.87263841e+00],
       [-5.75717966e-01],
       [-1.93162283e-01],
       [-5.56590182e-01],
       [-7.35116168e-01],
       [-3.33432700e-01],
       [-4.80079045e-01],
       [ 5.46445372e-01],
       [-3.52560484e-01],
       [ 6.82507677e-02],
       [ 1.78337542e+00],
       [-9.32769938e-01],
       [ 1.76424763e+00],
       [-1.70425723e+00],
       [ 2.46776753e-01],
       [ 1.58572165e+00],
       [-2.06130920e+00],
       [-1.3

In [57]:
# Need to wrap my head around this (where's the predictor)
# https://towardsdatascience.com/machine-learning-basics-support-vector-regression-660306ac5226
print("accuracy score:", regressor.score(X_test, y_test))

accuracy score: 0.9884879487662313
