## Import packages

In [64]:
#For  Colab 
#from google.colab import drive
#drive.mount('/content/drive')


In [65]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [66]:
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings

import re 
import nltk                                         #Natural language processing tool-kit
from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer

#from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
#from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec                                   #For Word2Vec

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, cohen_kappa_score, accuracy_score


## Import data

In [67]:
# Local path. On Colab, read the copy on the mounted Drive instead:
#   "/content/drive/MyDrive/Automated_Essay_Scoring/data/training_set_rel3.tsv"
df = (
    pd.read_csv("../data/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')
    # Discard every column that contains at least one missing value.
    .dropna(axis=1)
    # The individual rater scores and the essay identifier are not used below.
    .drop(columns=['rater1_domain1', 'rater2_domain1', 'essay_id'])
)
df.head()

Unnamed: 0,essay_set,essay,domain1_score
0,1,"Dear local newspaper, I think effects computer...",8
1,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,1,"Dear @LOCATION1, I know having computers has a...",8


In [68]:
#df['essay'][1]

In [69]:
# Rescale domain1_score so that the scores of all essay sets lie in the same
# interval (presumably 0-10, given the per-set formulas -- TODO confirm against
# the score ranges of the dataset).
#
# Each essay set maps to (offset, multiplier, divisor); the rescaled score is
#     (score - offset) * multiplier / divisor
# Essay sets that are not listed keep their original score.
score_rescaling = {
    1: (2, 1, 1),
    2: (1, 2, 1),
    3: (0, 10, 3),
    4: (0, 10, 3),
    5: (0, 10, 4),
    6: (0, 10, 4),
    7: (0, 1, 3),
    8: (0, 1, 6),
}

# Work on a float copy: several formulas produce fractional scores, and
# element-wise chained assignment (df['col'][i] = ...) is not guaranteed to
# write back into the DataFrame.
rescaled_scores = df['domain1_score'].astype('float64')
for essay_set, (offset, multiplier, divisor) in score_rescaling.items():
    in_set = df['essay_set'] == essay_set
    rescaled_scores.loc[in_set] = (df.loc[in_set, 'domain1_score'] - offset) * multiplier / divisor

# NOTE: this overwrites the raw scores, so the cell must be run exactly once
# after the data is loaded (re-running it would rescale the scores again).
df['domain1_score'] = rescaled_scores
## Text preprocessing

In [70]:
# Fetch the NLTK stopword corpus (reported as already up-to-date when it is installed).
nltk.download('stopwords')
# English stopwords kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
len(stop_words) # number of English stopwords, shown as the cell output

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/GuillaumeKunsch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [71]:
snow = nltk.stem.SnowballStemmer('english')
non_letters = re.compile('[^a-zA-Z]')

# Build one cleaned string per essay: keep letters only, lower-case, drop
# English stopwords and stem the remaining words.
# `stop_words` is the set built in the stopwords cell above; testing against it
# avoids re-reading the stopword list for every single word.
corpus = []
for essay in df['essay']:
    tokens = non_letters.sub(' ', essay).lower().split()
    stemmed = [snow.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stemmed))

In [72]:
# Size of the index space handed to one_hot() for the word encoding.
voc_size = 5000
# Encode every cleaned essay of `corpus` as a list of integer word indices.
onehot_repr = [one_hot(document, voc_size) for document in corpus]
type(onehot_repr)

list

In [73]:
# Every encoded essay is brought to this fixed length.
sent_length = 400
# 'pre' padding puts the zeros in front of the word indices.
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')
print(embedded_docs)

[[   0    0    0 ...  492 3258 4761]
 [   0    0    0 ... 3500 1051 1707]
 [   0    0    0 ... 1648 4919 2295]
 ...
 [3162 1714 4364 ... 2897 3757 1911]
 [   0    0    0 ... 4826 2098 4811]
 [   0    0    0 ... 3162 2433 2922]]


## NN architecture

In [74]:
# Punkt sentence-tokenizer data; essay_to_sentences() below loads it from
# 'tokenizers/punkt/english.pickle'.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/GuillaumeKunsch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [75]:
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built on first use only."""
    cached = getattr(_english_stopwords, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cached = cached
    return cached


def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn a piece of essay text into a list of lower-case words.

    Every character that is not an ASCII letter (digits, punctuation, the '@'
    of anonymisation tags, ...) is replaced by a space before the text is
    lower-cased and split on whitespace.

    Args:
        essay_v: The text to tokenize.
        remove_stopwords: When true, English stopwords are removed from the result.

    Returns:
        The list of words, in their original order.
    """
    words = re.sub("[^a-zA-Z]", " ", essay_v).lower().split()
    if remove_stopwords:
        # The stopword set is cached: this function is called once per sentence
        # and once per essay, and rebuilding the set each time is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and tokenize each sentence into words.

    The stripped essay is cut into sentences with the NLTK Punkt tokenizer and
    every non-empty sentence is passed to essay_to_wordlist().

    Args:
        essay_v: The essay text.
        remove_stopwords: Forwarded to essay_to_wordlist().

    Returns:
        A list with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(sentence, remove_stopwords)
        for sentence in punkt.tokenize(essay_v.strip())
        if len(sentence) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one essay.

    Args:
        words: Iterable of word strings (one tokenized essay).
        model: Object exposing `model.wv.index_to_key` (the vocabulary) and
            `model.wv[word]` (the vector of a word), e.g. a trained Word2Vec model.
        num_features: Length of the word vectors.

    Returns:
        A 1-D array of length `num_features` holding the mean of the vectors of
        the words found in the vocabulary. When no word is in the vocabulary
        (or `words` is empty) the zero vector is returned instead of dividing
        by zero, which would produce a vector full of NaN.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    vocabulary = set(model.wv.index_to_key)
    known_words = [word for word in words if word in vocabulary]
    for word in known_words:
        featureVec = np.add(featureVec, model.wv[word])
    if known_words:
        featureVec = np.divide(featureVec, float(len(known_words)))
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    Args:
        essays: Sequence of tokenized essays (each one a list of words).
        model: Word-vector model forwarded to makeFeatureVec().
        num_features: Length of the word vectors.

    Returns:
        A float32 array of shape (len(essays), num_features); row k is the
        result of makeFeatureVec() for the k-th essay.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay, model, num_features)
    return essayFeatureVecs

In [76]:
def get_model():
    """Build, compile and print the summary of the LSTM scoring network.

    The network stacks two LSTM layers (300 then 64 units), a dropout layer and
    a single-unit dense output. It expects inputs of shape (1, 300) per sample
    and is compiled with a mean-squared-error loss and the RMSprop optimizer.

    Returns:
        The compiled Keras Sequential model.
    """
    layers = [
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ]
    network = Sequential(layers)

    network.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['accuracy', 'mae'])
    network.summary()

    return network

In [77]:
# X is another name for the same DataFrame object as df (no copy is made);
# the cross-validation loop below reads its 'essay' column.
X=df
# Regression target: the domain1_score column of that frame.
y = X['domain1_score']

In [78]:
# 5-fold cross-validation of the Word2Vec + LSTM pipeline.
# The fold assignment is seeded so that the splits are identical on every run.
# Word2Vec (trained with several workers) and the Keras training are not seeded
# here, so the scores can still vary slightly between runs.
n_folds = 5
cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
mses = []
cohen_kappa_scores = []
accuracies = []
y_pred_list = []

# Word2Vec hyper-parameters (the same for every fold).
num_features = 300      # vector_size: length of the word vectors
min_word_count = 40     # min_count
num_workers = 4         # workers
context = 10            # window
downsampling = 1e-3     # sample

for count, (traincv, testcv) in enumerate(cv.split(X), start=1):
    print("\n--------Fold {}--------\n".format(count))
    X_train, X_test = X.iloc[traincv], X.iloc[testcv]
    y_train, y_test = y.iloc[traincv], y.iloc[testcv]

    train_essays = X_train['essay']
    test_essays = X_test['essay']

    # Word2Vec is trained on the sentences of the training essays of this fold only.
    sentences = []
    for essay in train_essays:
        sentences += essay_to_sentences(essay, remove_stopwords=True)

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context, sample=downsampling)

    # NOTE(review): init_sims() is believed to be deprecated in gensim 4 --
    # confirm against the installed version before upgrading.
    model.init_sims(replace=True)
    # The same file is overwritten at every fold; only the last model remains on disk.
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    # One averaged word vector per essay, for the training and the test essays.
    clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

    clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
    testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

    # Reshape to 3 dimensions (samples, 1, num_features): 1 represents one timestep.
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
    y_pred = lstm_model.predict(testDataVecs)

    # Save one of the models (the one trained on the last fold).
    if count == n_folds:
        lstm_model.save('./final_lstm.h5')

    # Keep the raw predictions; the rounded ones are used for kappa and accuracy.
    y_pred_list.append(y_pred)
    y_pred_round = np.around(y_pred)
    y_test_round = np.around(y_test.values)

    # MSE uses the raw predictions; quadratic weighted kappa and accuracy
    # compare the scores rounded to the nearest integer.
    mse_score = mean_squared_error(y_test.values, y_pred)
    ck_score = cohen_kappa_score(y_test_round, y_pred_round, weights='quadratic')
    acc = accuracy_score(y_test_round, y_pred_round)

    print("MSE Score: {}".format(mse_score))
    print("Acc Score: {}".format(acc))
    print("Kappa Score: {}".format(ck_score))
    mses.append(mse_score)
    accuracies.append(acc)
    cohen_kappa_scores.append(ck_score)



--------Fold 1--------

Training Word2Vec Model...
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 1, 300)            721200    
                                                                 
 lstm_5 (LSTM)               (None, 64)                93440     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch

In [79]:
# Mean of each per-fold metric, rounded to 4 decimals.
cv_summary = (
    ("Average MSE score after a 5-fold cross validation: ", mses),
    ("Average Accuracy score after a 5-fold cross validation: ", accuracies),
    ("Average Kappa score after a 5-fold cross validation: ", cohen_kappa_scores),
)
for label, fold_scores in cv_summary:
    print(label, np.around(np.mean(fold_scores), decimals=4))

Average MSE score after a 5-fold cross validation:  3.2903
Average Accuracy score after a 5-fold cross validation:  0.2313
Average Kappa score after a 5-fold cross validation:  0.5402
