## Importing the Data

In [0]:
!pip install tensorflow-gpu

In [0]:
# Mount Google Drive at /content/gdrive; the dataset, training logs and saved
# models referenced by the paths in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# !gunzip '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/GoogleNews-vectors-negative300.bin.gz'

In [0]:
# Constants
DATASET_DIR = '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/data/'

import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the essay table (tab-separated, ISO-8859-1 encoded).
X = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')

# Lowest / highest domain1_score per essay set, indexed directly by the essay_set id.
# Index 0 holds a -1 placeholder so essay_set values can be used as list indices.
minimum_scores = [-1, 2, 1, 0, 0, 0, 0, 0, 0]
maximum_scores = [-1, 12, 6, 3, 3, 4, 4, 30, 60]

# Rescale every score by the range of its own essay set:
#   (score - set_min) / (set_max - set_min)
# A vectorised lookup replaces the previous per-row `X.iloc[i]` loop over `X.index`,
# which mixed index labels with positions and built one Series per row.
essay_set_ids = X['essay_set'].astype(int).values
score_floor = np.asarray(minimum_scores)[essay_set_ids]
score_ceiling = np.asarray(maximum_scores)[essay_set_ids]
new_y = (X['domain1_score'].values - score_floor) / (score_ceiling - score_floor)
normalized_y = pd.Series(new_y)

# Raw (un-normalised) scores, kept for reference.
y = X['domain1_score']
# Drop every column that contains at least one missing value, then the rater /
# score / id columns so that only model inputs (and essay_set) remain in X.
X = X.dropna(axis=1)
X = X.drop(columns=['rater1_domain1', 'rater2_domain1', 'domain1_score', 'essay_id'])
# 80/20 split with a fixed seed; the targets are the normalised scores.
X_train, X_test, y_train, y_test = train_test_split(X, normalized_y, test_size=0.2, random_state=42)

In [0]:
# Display the training targets (the per-essay-set normalised scores from the split above).
y_train

# Preprocessing the Data

Below are our implementations of various functions to tokenise and pad the essays and to encode them as integer sequences for the word-embedding layer.

In [0]:
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import keras
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.sequence import  pad_sequences
from nltk import FreqDist
# import contractions

# nltk.download('punkt')
# nltk.download('stopwords')
# tokenizer = RegexpTokenizer(r'\w+')
# num_regex = re.compile('^[+-]?[0-9]+\.?[0-9]*$')
# vocab_size = 4000
# def is_number(token):
# 	return bool(num_regex.match(token))
 
# def tokenise(essay, remove_stopwords=False):
#   tokens = tokenizer.tokenize(contractions.fix(essay.lower()))
#   if remove_stopwords:
#     tokens =  [word for word in tokens if not word in stopwords.words()]
#   return tokens

# def create_vocab(essays, vocab_size=vocab_size):
#   total_tokens = []
#   for essay in essays:
#     tokens = tokenise(essay)
#     total_tokens += tokens
#   # print(len(total_tokens))
  
#   vocab = {'<pad>':0, '<unk>':1, '<num>':2}
#   vcb_len = len(vocab)
#   index = vcb_len
#   for word, _ in FreqDist(total_tokens).most_common():
#     # if word in stopwords.words():
#     #   continue
#     vocab[word] = index
#     index += 1
#     if index == vocab_size:
#       break
#   del total_tokens
#   return vocab

# def convert_data(essays, vocab, max_length=0):
#   encoded_essays = []
#   maxlen = 0
#   for essay in essays:
#     encoded_essay = []
#     tokens = tokenise(essay)
#     maxlen = max(maxlen, len(tokens))
#     for word in tokens:
#       if is_number(word):
#         encoded_essay.append(vocab['<num>'])
#       elif word in vocab:
#         encoded_essay.append(vocab[word])
#       else:
#         encoded_essay.append(vocab['<unk>'])
#     encoded_essays.append(encoded_essay)
#   if max_length == 0:
#     max_length = maxlen  
#   return pad_sequences(encoded_essays, maxlen=max_length, padding='post'), max_length

In [0]:
# vocab = create_vocab(X_train['essay'].values)

In [0]:
# emb_dim = 50
# embeddings = {}
# emb_file = open('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/En_vectors.txt')
# lines = emb_file.readlines()
# for line in lines:
#   tokens = line.split()
#   word = tokens[0]
#   vec = tokens[1].split(',')
#   embeddings[word] = vec

# def get_emb_matrix_given_vocab(vocab, embeddings):
#   counter = 0
#   emb_matrix = np.zeros((len(vocab), emb_dim))
#   for word, index in vocab.items():
#     try:
#       emb_matrix[index] = embeddings[word]
#       counter += 1
#     except KeyError:
#       pass
#   print(counter, len(vocab), 100*counter/len(vocab))
#   return emb_matrix
# emb_matrix = get_emb_matrix_given_vocab(vocab, embeddings)

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import  pad_sequences
# Hyper-parameters shared with the model definition and the log-file naming.
vocab_size = 4000   # number of most frequent words kept by the tokenizer
max_length = 550    # every essay is padded to this many tokens
emb_dim = 50        # embedding width used by the model

train_essays = X_train['essay']
test_essays = X_test['essay']

# Build the word index from the training essays only, so no vocabulary
# information from the test split is used.
t = Tokenizer(num_words=vocab_size)
t.fit_on_texts(train_essays.tolist())

# Convert each essay into a sequence of integer word ids.
encoded_train = t.texts_to_sequences(train_essays.tolist())
encoded_test = t.texts_to_sequences(test_essays.tolist())

# Pad at the end of each sequence so all inputs share the length max_length.
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')

In [0]:
def mean_quadratic_weighted_kappa(kappas, weights=None):
    """
    Average several quadratic weighted kappa values in Fisher z-space.

    Each kappa is first capped to [-0.999, 0.999] (the transform is undefined
    at +/-1), mapped with Fisher's r-to-z transform
    z = 0.5 * ln((1 + k) / (1 - k)), optionally weighted, averaged, and the
    mean is mapped back with the inverse transform.

    Parameters
    ----------
    kappas : sequence of float
        The kappa values to average.
    weights : array-like, optional
        One weight per kappa. Weights are rescaled to have mean 1 and are
        applied in z-space. When omitted, every kappa counts equally.

    Returns
    -------
    float
        The mean kappa after the inverse transform.
    """
    kappa_values = np.asarray(kappas, dtype=float)

    if weights is None:
        weights = np.ones(np.shape(kappa_values))
    else:
        # Normalise so the weights average to 1 and np.mean below acts as a weighted mean.
        weights = weights / np.mean(weights)

    # Keep the values strictly inside (-1, 1) so the log below stays finite.
    capped = np.clip(kappa_values, -.999, .999)

    fisher_z = 0.5 * np.log((1 + capped) / (1 - capped)) * weights
    mean_z = np.mean(fisher_z)

    # Inverse Fisher transform of the averaged z value.
    exp_term = np.exp(2 * mean_z)
    return (exp_term - 1) / (exp_term + 1)

In [0]:
def evaluatePreds(y_test, y_pred, X_test):
  """Score normalised predictions with quadratic weighted kappa per essay set.

  Both the targets and the predictions are mapped back from the normalised
  scale to each essay set's original score range (using the module-level
  `minimum_scores` / `maximum_scores` tables), rounded to integer scores and
  compared with Cohen's kappa using quadratic weights. The per-set kappas and
  their Fisher-averaged mean are printed.

  Args:
    y_test: normalised true scores, one per row of `X_test` (Series or array).
    y_pred: normalised predicted scores; any shape that flattens to one value
      per row of `X_test` (e.g. the `(n, 1)` output of `model.predict`).
    X_test: frame with an `essay_set` column aligned row-by-row with the scores.

  Returns:
    list of float: one kappa per essay set id, for ids 1 .. len(minimum_scores) - 1.

  Raises:
    ValueError: if `y_pred` contains NaN or infinite values.
  """
  # Imported here so the function does not rely on a later notebook cell having run.
  from sklearn.metrics import cohen_kappa_score

  print('*'*100)
  print("Calculating Kappa Scores....")

  essay_sets = np.asarray(X_test['essay_set']).astype(int)
  lows = np.asarray(minimum_scores)[essay_sets]
  highs = np.asarray(maximum_scores)[essay_sets]
  spans = highs - lows

  true_norm = np.asarray(y_test, dtype=float)
  # reshape(-1) instead of squeeze: squeeze turns a single prediction into a
  # 0-d array, which cannot be indexed per row.
  pred_norm = np.asarray(y_pred, dtype=float).reshape(-1)
  if not np.all(np.isfinite(pred_norm)):
    raise ValueError("y_pred contains NaN or infinite values; cannot compute kappa")

  # Undo the per-essay-set min-max scaling and round to integer scores.
  new_y_test = np.around(true_norm * spans + lows).astype(int)
  new_y_pred = np.around(pred_norm * spans + lows)
  # Keep predictions on the rating scale so no sample falls outside `labels` below.
  new_y_pred = np.clip(new_y_pred, lows, highs).astype(int)

  finalResult = []
  for set_id in range(1, len(minimum_scores)):
    in_set = essay_sets == set_id
    # Pass the full rating scale: scikit-learn derives the quadratic weights
    # from the position of each label in `labels`, so leaving it to the
    # observed scores would shrink the distance across unobserved scores.
    rating_scale = np.arange(minimum_scores[set_id], maximum_scores[set_id] + 1)
    finalResult.append(cohen_kappa_score(new_y_test[in_set], new_y_pred[in_set], labels=rating_scale, weights='quadratic'))
  print(finalResult)
  print("Average QWK (The one that matters):", mean_quadratic_weighted_kappa(finalResult))
  print('*'*100)
  return finalResult

In [0]:
from sklearn.metrics import cohen_kappa_score
from datetime import datetime
import os
import csv

class CheckKappa(keras.callbacks.Callback):
    """Keras callback that logs training metrics to CSV and periodically computes QWK.

    After every epoch one row (`Epoch`, `Train_Loss`, `mae`, `Score`) is appended
    to a CSV file in the Drive `train_log` directory. Every `interval` epochs the
    model is evaluated on the module-level `padded_test` / `y_test` / `X_test`
    with `evaluatePreds`, and the mean quadratic weighted kappa is stored in the
    `Score` column; other epochs get NaN there.

    Args:
        interval: evaluate QWK whenever the (1-based) epoch number is a multiple of this.
        custom_filename: log file name inside the log directory; when None a
            timestamped name containing `max_length` and `emb_dim` is generated.
        is_continue: when True the existing log file is read instead of a new
            header being written (so the file must already exist).
        continue_from: number of epochs already trained. Rows after this epoch
            are dropped from the existing log and epoch numbers are offset by it.

    Attributes set during training:
        losses: training loss of every epoch.
        scores: mean QWK for evaluated epochs, -1 for the others.
        filename: full path of the CSV log.
    """

    def __init__(self, interval=10, custom_filename=None, is_continue=False, continue_from=None):
        # Initialise the Callback base class itself. The previous
        # super(keras.callbacks.Callback, self) call started the lookup *after*
        # Callback and therefore skipped its initialiser.
        super(CheckKappa, self).__init__()
        self.interval = interval
        self.custom_filename = custom_filename
        self.is_continue = is_continue
        self.continue_from = continue_from

    def on_train_begin(self, logs=None):
        """Resolve the log file path and write the header or trim an existing log."""
        self.scores = []
        self.losses = []
        self.filename = ""
        dir_path = "/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/train_log/"
        os.makedirs(dir_path, exist_ok=True)
        model_name = "seq" + str(max_length) + "_emb" + str(emb_dim)
        if self.custom_filename is None:
            self.filename = os.path.join(dir_path, "train_" + datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p_")) + model_name + '.csv'
        else:
            self.filename = os.path.join(dir_path, self.custom_filename)

        self.fieldnames = ['Epoch', 'Train_Loss', 'mae', 'Score']

        if not self.is_continue:
            # newline='' is what the csv module expects for files it writes to.
            with open(self.filename, 'a', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
                writer.writeheader()
        else:
            previous_log = pd.read_csv(self.filename)
            if self.continue_from is not None:
                # Discard rows logged after the epoch training resumes from.
                previous_log[previous_log['Epoch'] <= self.continue_from].to_csv(self.filename, index=False)
                print("Continuing Training after Epoch {0}".format(self.continue_from))

    def on_train_end(self, logs=None):
        """No action at the end of training."""
        return

    def on_epoch_begin(self, epoch, logs=None):
        """No action at the start of an epoch."""
        return

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics (and QWK, on evaluation epochs) to the CSV log."""
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        self.scores.append(-1)  # placeholder, overwritten below on evaluation epochs

        # `epoch` is 0-based and restarts at 0 when training is resumed.
        cur_epoch = epoch
        if self.continue_from is not None:
            cur_epoch += self.continue_from
        row = {"Epoch": cur_epoch + 1, "Train_Loss": logs.get('loss'), "mae": logs.get('mae'), "Score": np.nan}

        if (cur_epoch + 1) % self.interval == 0:
            y_pred = self.model.predict(padded_test)
            finalResult = evaluatePreds(y_test, y_pred, X_test)
            mean_kappa = mean_quadratic_weighted_kappa(finalResult)
            row["Score"] = mean_kappa
            self.scores[-1] = mean_kappa

        with open(self.filename, 'a', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
            writer.writerow(row)

        return

## Defining the model 

In [0]:
# Read by get_model(): when True the Embedding layer is initialised from `emb_matrix`.
# NOTE(review): `emb_matrix` is only built in a commented-out cell of this notebook,
# so setting this to True requires re-enabling that cell first.
pretrained_embeddings = False

In [0]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K
from keras.optimizers import RMSprop
from keras.initializers import Constant
def get_model():
    """Build and compile the LSTM essay-scoring model.

    Layers: Embedding(vocab_size, emb_dim, mask_zero=True) -> LSTM(300)
    -> Dropout(0.5) -> Dense(1, sigmoid). The sigmoid output is trained with
    mean squared error and RMSprop, and mean absolute error is tracked as `mae`.

    Reads the module-level `vocab_size`, `emb_dim` and `pretrained_embeddings`.
    When `pretrained_embeddings` is true the embedding weights are initialised
    from a module-level `emb_matrix`, which must have been defined beforehand.

    Returns:
        The compiled `Sequential` model. Its summary is printed as a side effect.
    """
    model = Sequential()
    if pretrained_embeddings:
        model.add(Embedding(vocab_size, emb_dim, mask_zero=True, embeddings_initializer=Constant(emb_matrix)))
    else:
        model.add(Embedding(vocab_size, emb_dim, mask_zero=True))
    model.add(LSTM(300, dropout=0.5, recurrent_dropout=0.1, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    # `learning_rate` is the current argument name; `lr` is a deprecated alias.
    model.compile(loss='mean_squared_error', optimizer=RMSprop(learning_rate=1e-3), metrics=['mae'])
    model.summary()

    return model

## Training Phase

In [0]:
# train_essays, max_length = convert_data(X_train['essay'].values, vocab, max_length=250)

In [0]:
# Check Existing Logs
# !ls "/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/train_log/"

In [0]:
# Clear Logs
# !rm "/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/train_log/"*
# !rm "/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/lstm_model_epoch"*

In [0]:
# check saved models
# !ls "/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/"

emb200_seq500.h5	lstm_model_epoch-25.h5	lstm_model_epoch-65.h5
emb300_seq500.h5	lstm_model_epoch-30.h5	lstm_model_epoch-70.h5
emb50_seq500.h5		lstm_model_epoch-35.h5	lstm_model_epoch-75.h5
final_lstm.h5		lstm_model_epoch-40.h5	lstm_model_epoch-80.h5
lstm_model_epoch-05.h5	lstm_model_epoch-45.h5	lstm_model_epoch-85.h5
lstm_model_epoch-10.h5	lstm_model_epoch-50.h5	lstm_model_epoch-90.h5
lstm_model_epoch-15.h5	lstm_model_epoch-55.h5
lstm_model_epoch-20.h5	lstm_model_epoch-60.h5


In [0]:
# copy saved models
# !cp "/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/lstm_model_epoch-100.hf5" "/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/emb200_seq500.h5"

In [0]:
from keras.callbacks import ModelCheckpoint

# Checkpoint file name template; Keras fills in the epoch number.
filepath="/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/lstm_model_epoch-{epoch:02d}.h5"
# Save the model every 5 epochs. `monitor` now names the key Keras actually logs
# for the training loss ('loss'); 'train_loss' is not a logged quantity.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, period=5)
checkpoint_log = CheckKappa(interval=5)       # If resuming training, custom_filename= <Existing Log name> and is_continue=True
callbacks_list = [checkpoint_log, checkpoint]
lstm_model = get_model()
# lstm_model = load_model('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/lstm_model_epoch-90.h5')
# Train on the padded training essays against the normalised scores.
history = lstm_model.fit(padded_train, y_train, batch_size=64, epochs=100, callbacks=callbacks_list)

# lstm_model.save('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/final_lstm.h5')

In [0]:
# Plot train loss and mae per epoch from a saved CheckKappa CSV log
# (the QWK score from the same log is plotted in the next cell).
import matplotlib.pyplot as plt
# CheckKappa writes its logs to ".../train_log/", so read from that directory
# (the path previously pointed at ".../train_logs/").
log = pd.read_csv("/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/train_log/" + "train_07-05-2020_08-45-54_AM_seq500_emb50.csv")
plt.plot(log['Epoch'], log['Train_Loss'], c='b', label='loss')
plt.plot(log['Epoch'], log['mae'], c='r', label='mae')
plt.legend()
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/embedding/outputs/300d_runs/swap.png')

In [0]:
# The log stores NaN in `Score` for epochs where QWK was not evaluated;
# dropping rows with missing values leaves only the scored epochs.
scored_epochs = log.dropna()

plt.figure()
plt.plot(scored_epochs['Epoch'], scored_epochs['Score'], label='QWK score')
plt.legend()
plt.savefig('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/embedding/outputs/300d_runs/swap_result.png')

In [0]:
# Show the logged metrics for epoch 70.
log[log['Epoch']==70]

In [0]:
# Reload a previously saved model from Drive.
lstm_model_final = load_model('/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/emb50_seq500.h5')

# Run the reloaded model on the padded test essays.

split_y_pred = lstm_model_final.predict(padded_test)

In [0]:
# Per-essay-set quadratic weighted kappas for the reloaded model's test predictions.
kappas = evaluatePreds(y_test, split_y_pred, X_test)

In [0]:
# !cp '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/lstm_model_epoch-80.h5' '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/saved_models/emb300_seq500.h5'

In [0]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the `fit` call, and the folder the figures go to.
epoch_metrics = history.history
plots_dir = '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/outputs/'

# Figure 1: loss per epoch. NOTE: despite the "validationLoss" file name this
# is the 'loss' entry of the fit history, i.e. the training loss.
loss_arr = epoch_metrics['loss']
plt.plot(loss_arr, c='b', label='Loss')
plt.legend()
plt.savefig(plots_dir + 'validationLoss_vs_epoch_1_swap.png')

# Figure 2: mean absolute error per epoch.
plt.figure()
mae_arr = epoch_metrics['mae']
plt.plot(mae_arr, c='r', label='mae')
plt.legend()
plt.savefig(plots_dir + 'mae_vs_epoch_1_swap.png')

# Figure 3: both curves on the same axes.
plt.figure()
for curve, colour, curve_label in ((loss_arr, 'b', 'Loss'), (mae_arr, 'r', 'mae')):
    plt.plot(curve, c=colour, label=curve_label)
plt.legend()
plt.savefig(plots_dir + 'mae_val_loss_vs_epoch_1_swap.png')

In [0]:
import pickle

def save_obj(obj, name ):
    """Pickle `obj` to `<history dir>/<name>.pkl` on Drive.

    Args:
        obj: any picklable object.
        name: file name without the `.pkl` extension.
    """
    target_path = '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/history/' + name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Load and return the object pickled at `<history dir>/<name>.pkl` on Drive.

    Args:
        name: file name without the `.pkl` extension.

    Returns:
        The unpickled object.
    """
    source_path = '/content/gdrive/My Drive/Projects/NLP/automated_essay_grading/history/' + name + '.pkl'
    # NOTE: unpickling executes code embedded in the file; only load files you created.
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [0]:
# Persist the per-epoch metrics dict from the fit call so the curves can be re-plotted later.
save_obj(history.history, "history_dict_1-100_swap")

In [0]:
# Mean QWK (averaged in Fisher z-space) over the per-essay-set kappas of the reloaded model.
mean_quadratic_weighted_kappa(kappas)

Original Code: https://github.com/nusnlp/nea/blob/master/nea/models.py