# M - Automated Essay Scoring
_School of Information Technology_<br>
_Monash University Malaysia_<br>
(c) Copyright 2020, Ian Tan & Jun Qing Lim

Steps

- Import libraries
- Read dataset (ASAP)
- Extract features (into file) using EASE
- Conduct machine learning (Sci-kit Learn libraries)
- Evaluate (QWK)

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm #SVR is in SVM
from sklearn.metrics import accuracy_score, confusion_matrix

#### Import the EASE functions, which is located in the ease folder.

In [2]:
import sys
sys.path.insert(1, 'ease')
import create
import grade 
import model_creator 
import predictor_extractor 
import predictor_set 
import util_functions
import essay_set
import feature_extractor

from essay_set import EssaySet
from feature_extractor import FeatureExtractor

## Read Dataset

AES (Hewlett Foundation dataset from Kaggle) in the folder "aes"

In [3]:
train_set = pd.read_csv("asap-aes/training_set_rel3.tsv", sep='\t', encoding="latin-1")
#train_set.head()

In [4]:
# Am filtering just for this current exercise.
# Set 2 has 1,800 essays, sufficient for current work
train_set = train_set[train_set['essay_set'] == 2]  # filter for set 2
#train_set.head()

In [5]:
train_set = train_set.reset_index() # resets index
#train_set.head()

In [6]:
train_set['essay'] = [entry.lower() for entry in train_set['essay']] # lower case for all words in essay
#train_set.head()

In [7]:
essays = train_set['essay']
scores = train_set['domain1_score']

In [8]:
scores.columns = "score"

#### Create the essay sets

In [9]:
# This can take some time, be patient :-)
e_set = EssaySet()

for i in range(len(essays)):
    e_set.add_essay(essays[i], scores[i])

## Extract Features

In [10]:
f_extractor = FeatureExtractor()

In [11]:
length = f_extractor.gen_length_feats(e_set)
length_df = pd.DataFrame(
    length, 
    columns = [
        'chars', 
        'words', 
        'commas', 
        'apostrophes', 
        'punctuations', 
        'avg_word_length', 
        'POS', 
        'POS/total_words'
    ]
)

#### Collate the essay prompts
This consist of one essay from each set

In [12]:
essay_prompts = []

# Takes a bit of time also :)
for i in range(1,9):
    file = "prompts/set" + str(i) + ".txt"
    f = open(file, "r", encoding="latin-1") # there are some 0x9x characters, hence need to specify encoding
    essay_prompts.append(f.read())
    
def get_essay_prompt(essay_set):
    return essay_prompts[essay_set-1]

In [13]:
# Unsure how this works
e_set.update_prompt(get_essay_prompt(2))

# Need more explanation on how this works - look into EASE

prompts = f_extractor.gen_prompt_feats(e_set)
prompts_df = pd.DataFrame(prompts, columns = ['prompt_words', 'prompt_words/total_words', 'synonym_words', 'synonym_words/total_words'])

In [14]:
e_set

<essay_set.EssaySet at 0x24020448860>

In [15]:
# Another process that takes sometime to process
unstemmed = util_functions.get_vocab_essays_count(e_set._text, e_set._score)
stemmed = util_functions.get_vocab_essays_count(e_set._clean_stem_text, e_set._score)

bow = list(map(lambda a,b:[a,b], unstemmed, stemmed))
bow_df = pd.DataFrame(bow, columns = ['unstemmed', 'stemmed'])

In [16]:
features = pd.concat([length_df, prompts_df, bow_df], axis=1, sort=False)

In [17]:
features.head()

Unnamed: 0,chars,words,commas,apostrophes,punctuations,avg_word_length,POS,POS/total_words,prompt_words,prompt_words/total_words,synonym_words,synonym_words/total_words,unstemmed,stemmed
0,2639.0,527.0,15.0,13.0,21.0,5.00759,524.330784,0.994935,220.0,0.417457,112.0,0.212524,584,559
1,841.0,180.0,5.0,2.0,3.0,4.672222,178.6629,0.992572,82.0,0.455556,66.0,0.366667,210,210
2,1181.0,261.0,12.0,15.0,14.0,4.524904,257.992218,0.988476,144.0,0.551724,83.0,0.318008,291,285
3,2705.0,527.0,22.0,6.0,31.0,5.132827,521.65392,0.989856,245.0,0.464896,131.0,0.248577,547,528
4,2394.0,501.0,25.0,15.0,34.0,4.778443,484.298031,0.966663,216.0,0.431138,117.0,0.233533,591,562


In [18]:
# Export features to a file for next stage (optional)
dataset = features.merge(scores, left_index=True, right_index=True)

In [19]:
dataset.head()

Unnamed: 0,chars,words,commas,apostrophes,punctuations,avg_word_length,POS,POS/total_words,prompt_words,prompt_words/total_words,synonym_words,synonym_words/total_words,unstemmed,stemmed,domain1_score
0,2639.0,527.0,15.0,13.0,21.0,5.00759,524.330784,0.994935,220.0,0.417457,112.0,0.212524,584,559,4
1,841.0,180.0,5.0,2.0,3.0,4.672222,178.6629,0.992572,82.0,0.455556,66.0,0.366667,210,210,1
2,1181.0,261.0,12.0,15.0,14.0,4.524904,257.992218,0.988476,144.0,0.551724,83.0,0.318008,291,285,2
3,2705.0,527.0,22.0,6.0,31.0,5.132827,521.65392,0.989856,245.0,0.464896,131.0,0.248577,547,528,4
4,2394.0,501.0,25.0,15.0,34.0,4.778443,484.298031,0.966663,216.0,0.431138,117.0,0.233533,591,562,4


In [20]:
dataset.columns = ['chars', 'words', 'commas', 'apostrophes', 'punctuations',
       'avg_word_length', 'POS', 'POS/total_words', 'prompt_words',
       'prompt_words/total_words', 'synonym_words',
       'synonym_words/total_words', 'unstemmed', 'stemmed', 'score']

In [21]:
dataset.head()

Unnamed: 0,chars,words,commas,apostrophes,punctuations,avg_word_length,POS,POS/total_words,prompt_words,prompt_words/total_words,synonym_words,synonym_words/total_words,unstemmed,stemmed,score
0,2639.0,527.0,15.0,13.0,21.0,5.00759,524.330784,0.994935,220.0,0.417457,112.0,0.212524,584,559,4
1,841.0,180.0,5.0,2.0,3.0,4.672222,178.6629,0.992572,82.0,0.455556,66.0,0.366667,210,210,1
2,1181.0,261.0,12.0,15.0,14.0,4.524904,257.992218,0.988476,144.0,0.551724,83.0,0.318008,291,285,2
3,2705.0,527.0,22.0,6.0,31.0,5.132827,521.65392,0.989856,245.0,0.464896,131.0,0.248577,547,528,4
4,2394.0,501.0,25.0,15.0,34.0,4.778443,484.298031,0.966663,216.0,0.431138,117.0,0.233533,591,562,4


In [22]:
dataset.to_csv('maes_features.csv')

Can just use the features and score for the X and y but just to keep to certain convention if reading back from the CSV file above.


In [23]:
X = dataset.iloc[:,0:13].values.astype(float)
y = dataset.iloc[:,14].values.astype(float)

In [24]:
y

array([4., 1., 2., ..., 2., 3., 3.])

In [25]:
X.shape

(1800, 13)

In [26]:
y = np.array(y).reshape(-1,1)
y.shape

(1800, 1)

#### Conduct Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

In [28]:
len(X)

1800

In [29]:
len(y)

1800

#### Split the train and test sets

In [30]:
# To split the train / test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Have a look at the first few lines
print(y_test[:5, :])

[[-0.53668756]
 [ 2.04630071]
 [-0.53668756]
 [ 0.75480657]
 [ 0.75480657]]


#### Conduct the fit

In [31]:
from sklearn.svm import SVR
# most important SVR parameter is Kernel type. It can be #linear,polynomial or gaussian SVR. We have a non-linear condition #so we can select polynomial or gaussian but here we select RBF(a #gaussian type) kernel.
# kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’
# maybe use poly and increase the degree
regressor = SVR(kernel='rbf', gamma='auto', verbose=True)
regressor.fit(X_train,y_train.ravel())

[LibSVM]

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=True)

#### Test / Predict the fit

In [32]:
# Not used yet as I don't have a sample X
y_pred = regressor.predict(X_test)
y_pred = sc_y.inverse_transform(y_pred).round()

In [33]:
df = pd.DataFrame(
    {
        'Real Values':sc_y.inverse_transform(y_test.reshape(-1)),
        'Predicted Values':y_pred
    }
)
df.head(10)

Unnamed: 0,Real Values,Predicted Values
0,3.0,4.0
1,5.0,4.0
2,3.0,3.0
3,4.0,3.0
4,4.0,4.0
5,4.0,3.0
6,4.0,4.0
7,3.0,3.0
8,4.0,4.0
9,3.0,3.0


#### Accuracy Score

In [34]:
y_pred

array([4., 4., 3., 3., 4., 3., 4., 3., 4., 3., 4., 4., 3., 3., 3., 4., 3.,
       3., 3., 3., 4., 4., 3., 3., 3., 3., 3., 3., 4., 3., 4., 4., 3., 4.,
       3., 4., 4., 1., 3., 4., 3., 4., 4., 2., 3., 3., 3., 2., 4., 4., 3.,
       4., 4., 3., 3., 2., 3., 4., 3., 4., 3., 5., 3., 2., 4., 3., 3., 3.,
       4., 3., 3., 4., 3., 3., 4., 3., 4., 4., 4., 3., 4., 2., 4., 3., 4.,
       3., 3., 4., 3., 4., 4., 3., 3., 3., 4., 4., 3., 2., 3., 3., 3., 3.,
       4., 3., 3., 4., 4., 3., 3., 3., 4., 3., 4., 3., 3., 4., 4., 4., 4.,
       4., 3., 3., 3., 4., 3., 2., 3., 2., 4., 3., 3., 3., 4., 4., 3., 4.,
       4., 4., 3., 4., 3., 3., 3., 4., 3., 3., 4., 4., 4., 3., 4., 3., 4.,
       4., 3., 4., 3., 4., 4., 3., 3., 3., 3., 3., 3., 4., 3., 4., 3., 4.,
       4., 3., 3., 4., 4., 3., 3., 4., 3., 2., 3., 4., 2., 3., 3., 3., 4.,
       3., 4., 3., 4., 2., 3., 3., 4., 4., 3., 4., 3., 4., 4., 4., 4., 3.,
       4., 4., 3., 4., 2., 4., 3., 4., 3., 3., 3., 4., 3., 3., 4., 4., 4.,
       4., 4., 3., 4., 3.

In [35]:
y_test = sc_y.inverse_transform(y_test).round()
y_test.ravel()

array([3., 5., 3., 4., 4., 4., 4., 3., 4., 3., 4., 3., 4., 2., 3., 5., 3.,
       3., 3., 4., 4., 4., 4., 4., 3., 3., 3., 4., 3., 3., 3., 4., 3., 4.,
       1., 3., 4., 1., 2., 4., 3., 3., 4., 2., 3., 3., 3., 1., 4., 4., 4.,
       4., 4., 3., 3., 3., 4., 3., 3., 4., 3., 5., 3., 2., 3., 4., 3., 4.,
       4., 3., 4., 3., 4., 4., 3., 3., 4., 4., 4., 3., 3., 2., 4., 3., 4.,
       3., 3., 4., 3., 4., 4., 4., 3., 4., 3., 4., 3., 1., 3., 3., 4., 4.,
       4., 4., 3., 2., 4., 2., 3., 2., 4., 2., 4., 3., 3., 4., 4., 3., 4.,
       4., 3., 3., 3., 5., 2., 3., 3., 2., 5., 3., 3., 3., 3., 4., 3., 3.,
       4., 4., 3., 3., 3., 3., 3., 5., 3., 4., 4., 4., 3., 3., 4., 2., 4.,
       4., 2., 4., 2., 4., 4., 3., 2., 3., 3., 3., 4., 4., 3., 4., 4., 3.,
       4., 3., 3., 4., 4., 4., 3., 3., 3., 1., 4., 5., 3., 4., 3., 4., 4.,
       3., 4., 4., 3., 1., 3., 4., 3., 4., 4., 4., 4., 4., 4., 3., 4., 4.,
       3., 4., 3., 4., 2., 4., 3., 5., 4., 3., 3., 4., 3., 4., 4., 3., 5.,
       3., 4., 4., 4., 4.

In [37]:
# Need to wrap my head around this (where's the predictor)
# https://towardsdatascience.com/machine-learning-basics-support-vector-regression-660306ac5226
print("accuracy score:", regressor.score(X_test, y_pred))

accuracy score: -31.814631081200737


# End

In [39]:
print("accuracy score:", accuracy_score(df['Real Values'], df['Predicted Values']))

accuracy score: 0.6388888888888888
