## Importing the Data

In [0]:
# Mount Google Drive so files under /content/gdrive (dataset, saved weights, plots) are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# !gunzip '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/GoogleNews-vectors-negative300.bin.gz'

In [0]:
# Constants
DATASET_DIR = '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/data/'
SAVE_DIR = './'

import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw training table; the target is taken before any column is removed.
training_path = os.path.join(DATASET_DIR, 'training_set_rel3.tsv')
raw_frame = pd.read_csv(training_path, sep='\t', encoding='ISO-8859-1')
y = raw_frame['domain1_score']

# Drop every column containing a missing value, then the score and id columns
# so that only the remaining feature columns (including 'essay') are kept.
score_and_id_columns = ['rater1_domain1', 'rater2_domain1', 'domain1_score', 'essay_id']
X = raw_frame.dropna(axis=1).drop(columns=score_and_id_columns)

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X, split_X_test, y, split_y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
import nltk
# Fetch the NLTK stop-word corpus read through stopwords.words("english") when cleaning essays.
nltk.download('stopwords')

# **Helper Functions to Process Data**




In [0]:
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec

def getWords(essay_v, remove_stopwords):
    """Split a piece of text into lower-case alphabetic tokens.

    Every character that is not an ASCII letter is replaced by a space before
    the text is lower-cased and split on whitespace.

    Args:
        essay_v: The text to tokenise.
        remove_stopwords: When truthy, tokens found in NLTK's English
            stop-word list are left out of the result.

    Returns:
        A list of word strings in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

def getSentences(essay_v, remove_stopwords):
    """Tokenise an essay into sentences, each returned as a list of words.

    The stripped essay is split into sentences by NLTK's English punkt
    tokenizer, and each non-empty sentence is passed through getWords.

    Args:
        essay_v: The essay text.
        remove_stopwords: Forwarded unchanged to getWords for every sentence.

    Returns:
        A list with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        getWords(sentence_text, remove_stopwords)
        for sentence_text in punkt.tokenize(essay_v.strip())
        if len(sentence_text) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the embedding vectors of the in-vocabulary words of one essay.

    Args:
        words: Iterable of word strings.
        model: Either an object exposing its keyed vectors as ``model.wv``
            (e.g. a trained Word2Vec model) or the keyed vectors themselves.
            The vocabulary is read from ``index_to_key`` when that attribute
            exists and from ``index2word`` otherwise.
        num_features: Dimensionality of the embedding vectors.

    Returns:
        A 1-D array of length ``num_features`` holding the mean vector of the
        words found in the vocabulary. When no word is found the zero vector
        is returned rather than dividing by zero, which would yield NaNs.
    """
    featureVec = np.zeros((num_features,), dtype="float32")

    # Look vectors up through the keyed-vectors object: the vocabulary comes
    # from there as well, and indexing the model object directly is deprecated.
    vectors = getattr(model, "wv", model)
    vocab_words = getattr(vectors, "index_to_key", None)
    if vocab_words is None:
        vocab_words = vectors.index2word
    index2word_set = set(vocab_words)

    num_words = 0
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec, vectors[word])

    # Guard the average: an essay without any known word keeps the zero vector.
    if num_words > 0:
        featureVec = np.divide(featureVec, num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Build one averaged embedding row per essay.

    Args:
        essays: Sequence of essays, each already tokenised into words.
        model: Embedding model forwarded to makeFeatureVec.
        num_features: Dimensionality of the embedding vectors.

    Returns:
        A float32 array of shape ``(len(essays), num_features)`` whose i-th
        row is the makeFeatureVec result for the i-th essay.
    """
    averaged = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        averaged[row] = makeFeatureVec(essay_words, model, num_features)
    return averaged

## LSTM model 

We use a 3-layer LSTM model whose final dense layer has a ReLU activation. ReLU is used on the output because the target scores are not normalised and therefore do not lie between 0 and 1.

In [0]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K

def get_model():
    """Build, compile and summarise the essay-scoring network.

    The network stacks three LSTM layers (300, 200 and 64 units) over inputs
    of shape [1, 300], followed by dropout and a single ReLU output unit. It
    is compiled with mean-squared-error loss, the rmsprop optimizer and MAE
    as a tracked metric, and its summary is printed before it is returned.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(200, dropout=0.4, recurrent_dropout=0.4, return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

# Training

Now we train the model on the dataset.

We will use 4-Fold Cross Validation and measure the Quadratic Weighted Kappa for each fold.
We will then calculate Average Kappa for all the folds.

Importing pretrained word2vec model

In [0]:
# import gensim
# word2Vec_model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/GoogleNews-vectors-negative300.bin', binary=True)

In [0]:
!pip install tensorflow-gpu

In [0]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt

nltk.download('punkt')
# Seed the shuffle so the folds (and therefore the reported scores) are repeatable between runs.
cv = KFold(n_splits = 4, shuffle = True, random_state = 42)
results = []
y_pred_list = []

count = 1

max_result = 0.0
collected_mae = []
collected_epoch_results = []
collected_loss_history = []

# The LSTM of the best fold and the Word2Vec model it was trained with. They
# are kept as a pair because the LSTM weights are only meaningful together
# with the embeddings that produced its training inputs.
best_lstm_model = None
best_word2Vec_model = None

for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]

    train_essays = X_train['essay']
    test_essays = X_test['essay']

    # Word2Vec is trained from scratch on the sentences of this fold's training essays only.
    sentences = []

    for essay in train_essays:
        sentences += getSentences(essay, remove_stopwords = True)

    num_features = 300
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    word2Vec_model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    word2Vec_model.init_sims(replace=True)
    word2Vec_model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

    # Represent every essay by the average embedding of its words.
    clean_train_essays = []

    for essay_v in train_essays:
        clean_train_essays.append(getWords(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, word2Vec_model, num_features)

    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(getWords( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, word2Vec_model, num_features )

    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    print(trainDataVecs.shape)

    # Add a middle axis of length 1 so the arrays match the [1, 300] input shape of the LSTM.
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

    lstm_model = get_model()

    # Each fit() call is made without an `epochs` argument, and the fold is
    # scored after every call so per-epoch curves can be plotted afterwards.
    epochs = 50
    epoch_results = []
    loss_history = []
    mae = []
    for i in range(0,epochs):
        history = lstm_model.fit(trainDataVecs, y_train, batch_size=32)
        y_pred1 = lstm_model.predict(testDataVecs)
        y_pred1 = np.around(y_pred1)
        result = cohen_kappa_score(y_test.values,y_pred1,weights='quadratic')
        loss_history.append(history.history['loss'])
        epoch_results.append(result)
        mean_error = history.history['mae']
        mae.append(mean_error)

    collected_epoch_results.append(epoch_results)
    collected_loss_history.append(loss_history)
    collected_mae.append(mae)

    loss_train = history.history['loss']

    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)

    # Scores are integers, so predictions are rounded before computing the kappa.
    y_pred = np.around(y_pred)

    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')

    if max_result < result :
        lstm_model.save('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/model_weights/final_lstm.h5')
        max_result = result
        # Remember the embeddings that belong to the weights just written to disk.
        best_lstm_model, best_word2Vec_model = lstm_model, word2Vec_model

    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1
    # break

# Leave the best fold's LSTM and its matching Word2Vec model bound for later
# cells, so that anything evaluated afterwards agrees with the saved weights
# instead of silently using whatever the final fold produced.
if best_lstm_model is not None:
    lstm_model, word2Vec_model = best_lstm_model, best_word2Vec_model

In [0]:
# Report the mean kappa over the folds that were actually run; the fold count
# is taken from `results` so the message cannot disagree with the CV setup.
print("Average Kappa score after a {}-fold cross validation: ".format(len(results)),np.around(np.array(results).mean(),decimals=4))

In [0]:
# Draw the per-epoch training loss of each of the 4 folds on one chart and save it.
import matplotlib.pyplot as plt

loss_fold_styles = [
    ('r', 'loss_fold1'),
    ('b', 'loss_fold2'),
    ('y', 'loss_fold3'),
    ('g', 'loss_fold4'),
]
for fold_idx, (colour, legend_label) in enumerate(loss_fold_styles):
    plt.plot(collected_loss_history[fold_idx], c=colour, label=legend_label)
plt.legend()
plt.xlabel("epochs")
plt.ylabel("training_loss")
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/Word2Vec/outputs/300d/training_loss_vs_epoch.png')

In [0]:
# Plotting graphs for the mean absolute error of every epoch for each fold (4 folds)

plt.plot(collected_mae[0], c='r', label='mae_fold1')
plt.plot(collected_mae[1], c='b', label='mae_fold2')
plt.plot(collected_mae[2], c='y', label='mae_fold3')
plt.plot(collected_mae[3], c='g', label='mae_fold4')
plt.legend()
plt.xlabel("epochs")
# The plotted series is the 'mae' metric, so the axis is labelled as such.
plt.ylabel("mean_absolute_error")
# The output file name is kept unchanged so existing references to it keep working.
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/Word2Vec/outputs/300d/mean_squared_error_vs_epoch.png')

In [0]:
# Draw the per-epoch kappa score of each of the 4 folds on one chart and save it.
import matplotlib.pyplot as plt

kappa_fold_styles = [
    ('r', 'kappascore_fold1'),
    ('b', 'kappascore_fold2'),
    ('y', 'kappascore_fold3'),
    ('g', 'kappascore_fold4'),
]
for fold_idx, (colour, legend_label) in enumerate(kappa_fold_styles):
    plt.plot(collected_epoch_results[fold_idx], c=colour, label=legend_label)
plt.legend()
plt.xlabel("epochs")
plt.ylabel("kappa_score")
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/Word2Vec/outputs/300d/kappa_scores_vs_epoch.png')

In [0]:
import matplotlib.pyplot as plt

# `history` comes from the last fit() call only, which was made without an
# `epochs` argument, so it holds a single value per metric and would plot as
# one invisible point. The per-epoch series collected during training for the
# last fold are plotted instead. These are training metrics (fit() was given
# no validation data).
loss_arr = np.ravel(collected_loss_history[-1])
mae_arr = np.ravel(collected_mae[-1])

plt.figure()
plt.plot(loss_arr, c='b', label='Loss')
plt.legend()
plt.xlabel("epochs")
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/outputs/validationLoss_vs_epoch.png')

plt.figure()
plt.plot(mae_arr, c='r', label='mae')
plt.legend()
plt.xlabel("epochs")
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/outputs/mae_vs_epoch.png')

plt.figure()
plt.plot(loss_arr, c='b', label='Loss')
plt.plot(mae_arr, c='r', label='mae')
plt.legend()
plt.xlabel("epochs")
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/outputs/mae_val_loss_vs_epoch.png')

In [0]:

# Score the 20% hold-out split (split_X_test / split_y_test) with quadratic weighted kappa.
# NOTE(review): lstm_model_final is built and loaded from the saved weights here, but the
# prediction further down is made with lstm_model (bound by the training cell), so the
# weights loaded from disk are not what gets evaluated. word2Vec_model and num_features
# also come from the training cell -- confirm the embeddings belong to the model being scored.
lstm_model_final = get_model()
lstm_model_final.load_weights('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/model_weights/final_lstm.h5')

# trainSet = pd.read_csv(os.path.join(DATASET_DIR, 'test_set.tsv'), sep='\t', encoding='ISO-8859-1')

# Generate averaged word vectors for the held-out test essays.
test_essays = split_X_test['essay']
clean_test_essays = []
for essay_v in test_essays:
    clean_test_essays.append(getWords( essay_v, remove_stopwords=True ))
testDataVecs = getAvgFeatureVecs( clean_test_essays, word2Vec_model, num_features )

testDataVecs = np.array(testDataVecs)
# Reshape the test vectors to 3 dimensions (the middle 1 represents a single timestep).
testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

split_y_pred = lstm_model.predict(testDataVecs)

# Round y_pred to the nearest integer.
split_y_pred = np.around(split_y_pred)

finalResult = cohen_kappa_score(split_y_test.values,split_y_pred,weights='quadratic')

In [0]:
# Show the quadratic weighted kappa computed on the hold-out split.
print(finalResult)