# Data preprocessing

## Initialization

In [5]:
# Import relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import nltk
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader
import torch
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader
import torch.nn.functional as F 
from operator import itemgetter
from sklearn.decomposition import PCA

In [6]:
# Import data set
data = pd.read_excel('data/training_set_rel3.xlsx')

## Convert resolved grades to percentages

In [7]:
# Create grade columns
data['grade'] = np.nan

# Essay set #1: 2-12 grade range
data.loc[data.essay_set == 1, 'grade'] = (data.loc[data.essay_set == 1, 'domain1_score'] - 2) / 10

# Essay set #2: 2 domains with 1-6 and 1-4 grade ranges respectively
data.loc[data.essay_set == 2, 'grade'] = ((data.loc[data.essay_set == 2, 'domain1_score'] - 1) + (data.loc[data.essay_set == 2, 'domain2_score'] - 1)) / 8

# Essay set #3, #4: 0-3 grade range
data.loc[data.essay_set == 3, 'grade'] = data.loc[data.essay_set == 3, 'domain1_score'] / 3
data.loc[data.essay_set == 4, 'grade'] = data.loc[data.essay_set == 4, 'domain1_score'] / 3

# Essay set #5, #6: 0-4 grade range
data.loc[data.essay_set == 5, 'grade'] = data.loc[data.essay_set == 5, 'domain1_score'] / 4
data.loc[data.essay_set == 6, 'grade'] = data.loc[data.essay_set == 6, 'domain1_score'] / 4

# Essay set #7: 0-30 grade range
data.loc[data.essay_set == 7, 'grade'] = data.loc[data.essay_set == 7, 'domain1_score'] / 30

# Essay set #8: 0-60 grade range
data.loc[data.essay_set == 8, 'grade'] = data.loc[data.essay_set == 8, 'domain1_score'] / 60

# Remove ungraded essays
data = data[~data.grade.isnull()].reset_index(drop = True)

## Convert essays to word embeddings sequences

### Word2Vec

In [8]:
# Download pre-trained Word2Vec model
w2v = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=int(1e5))

In [9]:
# Use PCA to reduce the dimensionality of word embeddings
target_dim = 50
dictionary = list(w2v.key_to_index.keys())
emb_matrix = np.zeros((len(dictionary), 300))

for i, word in enumerate(dictionary):
    emb_matrix[i,:] = w2v[word]

pca = PCA(n_components=target_dim, random_state=0)
emb_matrix_pca = pca.fit_transform(emb_matrix)

w2v_pca = {}
for i, word in enumerate(dictionary):
    w2v_pca[word] = emb_matrix_pca[i,:]

In [10]:
# Create list of embedding matrices
emb_essays_w2v = []

for essay in tqdm(data.essay):
    sentences = nltk.sent_tokenize(essay)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    emb_essay = []

    for sentence in tokenized_sentences:
        for word in sentence[:-1]:
            try:
                emb_essay.append(w2v_pca[word])
            except:
                if word != ',':
                    emb_essay.append(np.zeros(target_dim))
                else:
                    continue
                    
        emb_essay.append(w2v_pca['</s>'])
    
    emb_essays_w2v.append(torch.tensor(np.array(emb_essay).T).float())

100%|██████████| 12977/12977 [00:18<00:00, 707.35it/s] 


In [11]:
# Max essay length
essays_lengths_w2v = [mat.shape[1] for mat in emb_essays_w2v]
max_essay_length_w2v = np.max(essays_lengths_w2v)
max_essay_length_w2v

1179

### GloVe

In [12]:
# Download pretrained GloVe model
glove = gensim.downloader.load('glove-wiki-gigaword-50')

In [27]:
# Create list of embedding matrices
emb_essays_glove = []
tokenizer = RegexpTokenizer(r"\w+|[^\w\s]")

for essay in tqdm(data.essay):
    emb_essay = []
    tok_essay = tokenizer.tokenize(essay)

    for word in tok_essay:
        try:
            emb_essay.append(glove[word.lower()])
        except:
            emb_essay.append(np.zeros(50))
    
    emb_essays_glove.append(torch.tensor(np.array(emb_essay).T).float())

100%|██████████| 12977/12977 [00:04<00:00, 2859.10it/s]


In [31]:
# Max essay length
essays_lengths_glove = [mat.shape[1] for mat in emb_essays_glove]
max_essay_length_glove = np.max(essays_lengths_glove)
max_essay_length_glove

1266

In [32]:
# Max essay length (W2V abd GloVe)
max_essay_length = max(max_essay_length_w2v, max_essay_length_glove)
max_essay_length

1266

## Split data into training/validation/test subsets

In [35]:
# Convert essay_set variables and grades to list of tensors
essay_sets = []
grades = []

for i in tqdm(range(data.shape[0])):
    essay_sets.append(torch.tensor(data.essay_set[i]))
    grades.append(torch.FloatTensor(data[['grade']].iloc[i].values)[0])

100%|██████████| 12977/12977 [00:03<00:00, 3518.76it/s]


In [36]:
# Take a given fraction of data samples for each essay set
train_frac = 0.8
val_frac = 0.1
test_frac = 1 - val_frac - val_frac
train_idx = []
val_idx = []
test_idx = []

for i in range(1,9):
    df = data[data.essay_set == i].sample(frac = 1)
    n_samples = df.shape[0]
    n_train_samples = int(train_frac*n_samples)
    n_val_samples = int(val_frac*n_samples)
    train_idx += list(df.index)[:n_train_samples]
    val_idx += list(df.index)[n_train_samples:(n_train_samples+n_val_samples)]
    test_idx += list(df.index)[(n_train_samples+n_val_samples):]

In [37]:
# Split data into train/validation subsets
essays_w2v_train = list(itemgetter(*train_idx)(emb_essays_w2v))
essays_w2v_val = list(itemgetter(*val_idx)(emb_essays_w2v))
essays_w2v_test = list(itemgetter(*test_idx)(emb_essays_w2v))

essays_glove_train = list(itemgetter(*train_idx)(emb_essays_glove))
essays_glove_val = list(itemgetter(*val_idx)(emb_essays_glove))
essays_glove_test = list(itemgetter(*test_idx)(emb_essays_glove))

essay_sets_train = list(itemgetter(*train_idx)(essay_sets))
essay_sets_val = list(itemgetter(*val_idx)(essay_sets))
essay_sets_test = list(itemgetter(*test_idx)(essay_sets))

grades_train = list(itemgetter(*train_idx)(grades))
grades_val = list(itemgetter(*val_idx)(grades))
grades_test = list(itemgetter(*test_idx)(grades))

## Pad essays to set them all to the same dimension

In [61]:
# Training data
train_w2v, train_glove = [], []

print('Padding essays from training set...')
for i in tqdm(range(len(train_idx))):
    # W2V
    essay, essay_set, grade = essays_w2v_train[i], essay_sets_train[i], grades_train[i]
    essay_pad = F.pad(essay, (0, max_essay_length - essay.shape[1]))
    train_w2v.append(
        TensorDataset(essay_pad.view(1, essay_pad.shape[0], essay_pad.shape[1]), essay_set.view(1), grade.view(1)))
    
    # Glove
    essay = essays_glove_train[i]
    essay_pad = F.pad(essay, (0, max_essay_length - essay.shape[1]))
    train_glove.append(
        TensorDataset(essay_pad.view(1, essay_pad.shape[0], essay_pad.shape[1]), essay_set.view(1), grade.view(1)))

train_w2v, train_glove = ConcatDataset(train_w2v), ConcatDataset(train_glove)

# Validation data
val_w2v, val_glove = [], []

print('Padding essays from validation set...')
for i in tqdm(range(len(val_idx))):
    # W2V
    essay, essay_set, grade = essays_w2v_val[i], essay_sets_val[i], grades_val[i]
    essay_pad = F.pad(essay, (0, max_essay_length - essay.shape[1]))
    val_w2v.append(
        TensorDataset(essay_pad.view(1, essay_pad.shape[0], essay_pad.shape[1]), essay_set.view(1), grade.view(1)))
    
    # Glove
    essay = essays_glove_val[i]
    essay_pad = F.pad(essay, (0, max_essay_length - essay.shape[1]))
    val_glove.append(
        TensorDataset(essay_pad.view(1, essay_pad.shape[0], essay_pad.shape[1]), essay_set.view(1), grade.view(1)))

val_w2v, val_glove = ConcatDataset(val_w2v), ConcatDataset(val_glove)

# Test data
test_w2v, test_glove = [], []

print('Padding essays from test set...')
for i in tqdm(range(len(test_idx))):
    # W2V
    essay, essay_set, grade = essays_w2v_test[i], essay_sets_test[i], grades_test[i]
    essay_pad = F.pad(essay, (0, max_essay_length - essay.shape[1]))
    test_w2v.append(
        TensorDataset(essay_pad.view(1, essay_pad.shape[0], essay_pad.shape[1]), essay_set.view(1), grade.view(1)))
    
    # Glove
    essay = essays_glove_test[i]
    essay_pad = F.pad(essay, (0, max_essay_length - essay.shape[1]))
    test_glove.append(
        TensorDataset(essay_pad.view(1, essay_pad.shape[0], essay_pad.shape[1]), essay_set.view(1), grade.view(1)))

test_w2v, test_glove = ConcatDataset(test_w2v), ConcatDataset(test_glove)

Padding essays from training set...


100%|██████████| 10379/10379 [00:03<00:00, 2721.25it/s]


Padding essays from validation set...


100%|██████████| 1295/1295 [00:00<00:00, 2582.42it/s]


Padding essays from test set...


100%|██████████| 1303/1303 [00:00<00:00, 2049.01it/s]


In [63]:
# Save padded data sets
torch.save(train_w2v, 'data/train_w2v.pt')
torch.save(train_glove, 'data/train_glove.pt')
torch.save(val_w2v, f'data/val_w2v.pt')
torch.save(val_glove, f'data/val_glove.pt')
torch.save(test_w2v, f'data/test_w2v.pt')
torch.save(test_glove, f'data/test_glove.pt')