In [None]:
# You will need to download any word embeddings required for your code, e.g.:

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# For any packages that Colab does not provide automatically you will also need to install these below, e.g.:

#! pip install torch

In [1]:
# Imports
import regex as re
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs
from numpy import dot
from numpy.linalg import norm

In [2]:
# Setting random seed and device
# A fixed seed keeps PyTorch's random number generators repeatable between
# runs; the notebook's random_split and shuffled DataLoader both rely on them.
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels so GPU runs are repeatable as well.
torch.backends.cudnn.deterministic = True

# Use the first GPU when one is visible, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Load data
# NOTE(review): these are absolute paths tied to one workspace; adjust them
# (or derive them from a configurable data directory) when running elsewhere.
train_df = pd.read_csv('/work/NLP_cwk/data/task-1/train.csv')
# The frame named `test_df` is read from dev.csv, not from a separate test file.
test_df = pd.read_csv('/work/NLP_cwk/data/task-1/dev.csv')

In [24]:
# Number of epochs (full passes over the training split) used by train()
epochs = 5

# Proportion of training data for train compared to dev:
# this fraction of train.csv is used for fitting, the remainder for validation.
train_proportion = 0.8

In [5]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Fit `model` for `number_epoch` epochs and validate after every epoch.

    Relies on the module-level `optimizer`, `loss_fn` and `device`, and on the
    sibling helpers `model_performance` and `eval`.

    Args:
        train_iter: iterable of (feature, original_mask, edit_mask, target) batches.
        dev_iter: iterable of validation batches with the same layout.
        model: network exposing `batch_size`, `hidden` and `init_hidden()`.
        number_epoch: how many passes to make over `train_iter`.

    Returns:
        (train_RMSEs, val_RMSEs): two lists holding one RMSE per epoch.
    """
    train_RMSEs, val_RMSEs = [], []

    print("Training model.")

    for epoch in range(1, number_epoch+1):

        model.train()
        running_loss = 0
        running_sse = 0
        seen = 0  # number of training examples consumed in this epoch

        for batch in train_iter:

            # Move the four batch tensors to the compute device
            feature, original_mask, edit_mask, target = (part.to(device) for part in batch)
            batch_n = target.shape[0]

            # The recurrent state is sized per batch, so rebuild it every step
            model.batch_size = batch_n
            seen += batch_n
            model.hidden = model.init_hidden()

            predictions = model(feature, original_mask, edit_mask).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            batch_sse = model_performance(predictions.detach().cpu().numpy(),
                                          target.detach().cpu().numpy())[0]

            loss.backward()
            optimizer.step()

            # loss is a per-example mean, so weight it by the batch size
            running_loss += loss.item()*batch_n
            running_sse += batch_sse

        valid_loss, valid_mse = eval(dev_iter, model)[:2]

        epoch_loss = running_loss / seen
        epoch_mse = running_sse / seen

        train_RMSEs.append(epoch_mse**0.5)
        val_RMSEs.append(valid_mse**0.5)

        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

    return train_RMSEs, val_RMSEs

In [6]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Score `model` on every batch of `data_iter` without updating any weights.

    Note that this function shadows Python's built-in `eval` inside the notebook.
    It relies on the module-level `loss_fn` and `device` and on `model_performance`.

    Returns:
        (mean loss per example, mean squared error per example,
         array of all predictions, array of all targets)
    """
    model.eval()
    running_loss = 0
    running_sse = 0
    seen = 0
    pred_all, trg_all = [], []

    with torch.no_grad():
        for batch in data_iter:
            feature, original_mask, edit_mask, target = (part.to(device) for part in batch)
            batch_n = target.shape[0]

            # The recurrent state is sized per batch, so rebuild it every step
            model.batch_size = batch_n
            seen += batch_n
            model.hidden = model.init_hidden()

            predictions = model(feature, original_mask, edit_mask).squeeze(1)
            loss = loss_fn(predictions, target)

            # Squared error is accumulated on NumPy copies of the tensors
            pred = predictions.detach().cpu().numpy()
            trg = target.detach().cpu().numpy()
            batch_sse = model_performance(pred, trg)[0]

            running_loss += loss.item()*batch_n
            running_sse += batch_sse
            pred_all.extend(pred)
            trg_all.extend(trg)

    return running_loss/seen, running_sse/seen, np.array(pred_all), np.array(trg_all)

In [7]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Compute the sum and the mean of squared errors between predictions and targets.

    Args:
        output: predictions; an ndarray or any array-like (e.g. a list of numbers).
        target: ground-truth values with a shape that broadcasts against `output`.
        print_output: when True, also print the MSE and the RMSE.

    Returns:
        (sse, mse): sum of squared errors and mean squared error for the inputs.
    """
    # Accept plain sequences as well as ndarrays; ndarrays pass through unchanged.
    output = np.asarray(output)
    target = np.asarray(target)

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)

    if print_output:
        # RMSE is only needed for the report, so compute it on demand.
        rmse = np.sqrt(mse)
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [8]:
def create_vocab(original_data, edit_words):
    """
    Tokenise headlines, splice each edit word in after the tagged span, and
    collect the vocabulary.

    Each headline marks the words to be replaced as ``<word(s)/>``. The headline
    is split on spaces and on the tag delimiters, every token is lower-cased, and
    the lower-cased edit word is appended directly after any raw token equal to
    the last tagged word.

    Args:
        original_data: sequence (list, pandas Series, ...) of headline strings.
        edit_words: sequence of replacement words, aligned with `original_data`.

    Returns:
        (vocabulary, tokenized_corpus, original_words) where
        vocabulary is a list of unique tokens in first-seen order,
        tokenized_corpus is one token list per headline, and
        original_words is one list of tagged words per headline.

    Raises:
        ValueError: if `edit_words` has fewer entries than `original_data`.
        IndexError: if a headline contains no ``<.../>`` tag.
    """
    if len(edit_words) < len(original_data):
        raise ValueError("edit_words must provide one edit word per headline")

    tokenized_corpus = []  # one list of tokens per headline
    original_words = []    # one list of tagged (to-be-replaced) words per headline

    # zip walks both inputs by position, so a pandas Series whose index is not
    # 0..n-1 (e.g. after filtering) is handled the same way as a plain list.
    for sentence, edit_word in zip(original_data, edit_words):
        old_words = re.findall("<(.*)/>", sentence)[0].split(' ')
        last_old_word = old_words[-1]
        edit_token = edit_word.lower()

        tokenized_sentence = []
        # Splitting on ' ', '<' and '/>' leaves empty-string tokens around the
        # tag; they are kept as-is.
        for token in re.split(' |<|/>', sentence):
            tokenized_sentence.append(token.lower())
            # NOTE: the comparison is made on the raw (un-lowercased) token, and
            # every matching token triggers an insertion, not only the tagged one.
            if last_old_word == token:
                tokenized_sentence.append(edit_token)

        tokenized_corpus.append(tokenized_sentence)
        original_words.append(old_words)

    # dict.fromkeys de-duplicates in linear time while keeping first-seen order;
    # testing membership against a growing list would be quadratic in vocab size.
    vocabulary = list(dict.fromkeys(
        token for sentence_tokens in tokenized_corpus for token in sentence_tokens))

    return vocabulary, tokenized_corpus, original_words

In [9]:
def collate_fn_padd(batch):
    '''
    Collate (features, original_mask, edit_mask, label) samples into padded tensors.

    Every sequence and both of its masks are right-padded with zeros up to the
    length of the longest sequence in the batch.

    Returns:
        (seq_tensor, original_mask_tensor, edit_mask_tensor, batch_labels);
        the first three are long tensors shaped (batch, max_len), the last is a
        float tensor with one label per sample.
    '''
    features, original_masks, edit_masks, labels = [], [], [], []
    for sample_features, sample_original, sample_edit, sample_label in batch:
        features.append(sample_features)
        original_masks.append(sample_original)
        edit_masks.append(sample_edit)
        labels.append(sample_label)

    lengths = [len(sample_features) for sample_features in features]
    padded_shape = (len(batch), max(lengths))

    seq_tensor = torch.zeros(padded_shape).long()
    original_mask_tensor = torch.zeros(padded_shape).long()
    edit_mask_tensor = torch.zeros(padded_shape).long()

    # Copy each sample into the left part of its row; the tail stays zero
    for row, length in enumerate(lengths):
        seq_tensor[row, :length] = torch.LongTensor(features[row])

    for row, length in enumerate(lengths):
        original_mask_tensor[row, :length] = torch.LongTensor(original_masks[row])

    for row, length in enumerate(lengths):
        edit_mask_tensor[row, :length] = torch.LongTensor(edit_masks[row])

    return seq_tensor, original_mask_tensor, edit_mask_tensor, torch.FloatTensor(labels)

class Task1Dataset(Dataset):
    """
    Dataset pairing each vectorised headline with two position masks and a label.

    Indexing returns (token ids, original-word mask, edit-word mask, label). Each
    mask is a long tensor as long as the headline, set to 1 at the stored
    positions. A stored position of -1 marks the last token, as with any negative
    index.
    """

    def __init__(self, train_data, original_indices, edit_indices, labels):
        self.x_train = train_data
        self.y_train = labels
        self.original_indices = original_indices
        self.edit_indices = edit_indices

    def __len__(self):
        return len(self.y_train)

    @staticmethod
    def _positions_to_mask(length, positions):
        """Return a long tensor of `length` zeros with ones at `positions`."""
        mask = torch.zeros(length).long()
        mask[positions] = 1
        return mask

    def __getitem__(self, item):
        token_ids = self.x_train[item]
        length = len(token_ids)

        original_mask = self._positions_to_mask(length, self.original_indices[item])
        edit_mask = self._positions_to_mask(length, self.edit_indices[item])

        return token_ids, original_mask, edit_mask, self.y_train[item]

In [10]:
class BiLSTM(nn.Module):
    """
    Single-layer bidirectional LSTM regressor with a selectable read-out.

    `experiment_type` chooses how the LSTM outputs become the scalar prediction:
        'insert_after'            - linear layer on the output at the last time step
        'original_representation' - linear layer on the mean output over the
                                    positions flagged by `original_mask`
        'cosine_distance'         - linear layer on (1 - cosine similarity) between
                                    the mean original and mean edit representations
        'simple_difference'       - linear layer on (original mean - edit mean)

    Callers must keep `batch_size` in sync with the incoming batch and reset
    `hidden` via `init_hidden()` before each forward pass.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device, experiment_type):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        # Index 0 is the padding index
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.experiment_type = experiment_type

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        # NOTE: nn.LSTM applies `dropout` only between stacked layers; with a
        # single layer it has no effect (PyTorch warns about this). Kept as-is.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, dropout = 0.5)

        # The linear layer that maps the concatenated forward/backward outputs
        # (hidden_dim * 2 features) to the scalar prediction
        self.hidden2label = nn.Linear(hidden_dim * 2, 1)
        self.hidden = self.init_hidden()

        # Maps the dissimilarity score to the funniness prediction
        self.dissim2label = nn.Linear(1,1)

    def init_hidden(self):
        """Return zero-filled (h_0, c_0) states sized for the current `batch_size`."""
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim);
        # one layer with two directions gives a leading dimension of 2.
        return torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device), \
               torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device)

    def forward(self, sentence, original_mask, edit_mask):
        """
        Predict one score per sequence in the batch.

        Args:
            sentence: long tensor of token ids, shaped (batch, seq_len).
            original_mask: (batch, seq_len) tensor with 1 at original-word positions.
            edit_mask: (batch, seq_len) tensor with 1 at edit-word positions.

        Returns:
            Tensor of shape (batch, 1).

        Raises:
            ValueError: if `experiment_type` is not one of the supported read-outs.
        """
        embedded = self.embedding(sentence)
        # Switch to time-major layout: (seq_len, batch, embedding_dim)
        embedded = embedded.permute(1, 0, 2)

        lstm_out, self.hidden = self.lstm(
            embedded.view(len(embedded), self.batch_size, self.embedding_dim), self.hidden)

        # Masks become (seq_len, batch, 1) so they broadcast over the feature axis
        original_mask = original_mask.transpose(0,1).unsqueeze(-1)
        edit_mask = edit_mask.transpose(0,1).unsqueeze(-1)

        # Mean LSTM output over the flagged positions of each sequence
        original_lstm_out = (lstm_out*original_mask).sum(0).div(original_mask.sum(0))
        edit_lstm_out = (lstm_out*edit_mask).sum(0).div(edit_mask.sum(0))

        if self.experiment_type == 'insert_after':
            out = self.hidden2label(lstm_out[-1])

        elif self.experiment_type == 'original_representation':
            out = self.hidden2label(original_lstm_out)

        elif self.experiment_type == 'cosine_distance':
            cos_dissim = 1 - torch.sum(original_lstm_out* edit_lstm_out, dim = 1)/(torch.norm(original_lstm_out, dim =1 )*torch.norm(edit_lstm_out, dim =1))
            cos_dissim = cos_dissim.unsqueeze(-1)
            out = self.dissim2label(cos_dissim)

        elif self.experiment_type == 'simple_difference':
            out = original_lstm_out - edit_lstm_out
            out = self.hidden2label(out)

        else:
            # Previously an unknown type fell through to `return out` and
            # surfaced as an UnboundLocalError.
            raise ValueError(f"Unknown experiment_type: {self.experiment_type!r}")

        return out

In [11]:
## Approach 1 code, using functions defined above:

# We set our training data and test data
# 'original' holds the headline text and 'edit' the replacement word that
# create_vocab consumes.
train_original_data = train_df['original']
train_edit_words = train_df['edit']
test_original_data = test_df['original']
test_edit_words = test_df['edit']

# Creating word vectors
# Tokenise each split separately; the training outputs feed the later cells.
training_vocab, training_tokenized_corpus, training_original_words = create_vocab(train_original_data, train_edit_words)
test_vocab, test_tokenized_corpus, test_original_words = create_vocab(test_original_data, test_edit_words)

# Creating joint vocab from test and train:
# ignore_index=True renumbers the concatenated rows 0..n-1, which create_vocab's
# positional indexing depends on.
joint_vocab, joint_tokenized_corpus, joint_original_words = create_vocab(
  pd.concat([train_original_data, test_original_data], ignore_index= True), 
  pd.concat([train_edit_words, test_edit_words], ignore_index= True))
print("Vocab created.")

Vocab created.


In [12]:
# We create representations for our tokens
wvecs = []      # word vectors, in the order the matching words appear in the GloVe file
word2idx = {}   # token -> embedding row
idx2word = {}   # embedding row -> token

# A set gives constant-time membership tests; scanning the vocabulary list for
# every line of the GloVe file is what made this cell slow.
joint_vocab_lookup = set(joint_vocab)

# This is a large file, it will take a while to load in the memory!
with codecs.open('glove.6B.100d.txt', 'r','utf-8') as f:
  index = 1  # row 0 is reserved for the padding token
  for line in f.readlines():
    fields = line.strip().split()
    # Ignore the first line - first line typically contains vocab, dimensionality
    if len(fields) > 3:
      word = fields[0]
      if word in joint_vocab_lookup:
          vec = list(map(float, fields[1:]))
          wvecs.append(vec)
          word2idx[word] = index
          idx2word[index] = word
          index += 1

word2idx['<pad>'] = 0
idx2word[0] = '<pad>'
# Prepend an all-zero row so that row 0 is the padding embedding
wvecs = np.insert(wvecs, 0, 0., axis=0)

# Tokens without a GloVe vector are dropped from the sequences
vectorized_seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in training_tokenized_corpus]

# To avoid any sentences being empty (if no words match to our word embeddings)
vectorized_seqs = [x if len(x) > 0 else [0] for x in vectorized_seqs]


In [13]:
# Locate, inside every vectorised sequence, the first position of each original
# (to-be-replaced) word and of the edit word. A sequence in which none is found
# gets the placeholder position -1.
corpus_original_indices = []
corpus_edit_indices = []
for row, token_ids in enumerate(vectorized_seqs):
    tokens = [idx2word[token_id] for token_id in token_ids]

    lowered_originals = [word.lower() for word in training_original_words[row]]
    original_positions = [tokens.index(word) for word in lowered_originals if word in tokens]

    edit_token = train_edit_words[row].lower()
    edit_positions = [tokens.index(edit_token)] if edit_token in tokens else []

    corpus_original_indices.append(original_positions or [-1])
    corpus_edit_indices.append(edit_positions or [-1])

In [25]:
# Model hyper-parameters
INPUT_DIM = len(word2idx)  # vocabulary size, including the '<pad>' entry
EMBEDDING_DIM = 100  # presumably has to match the 100-d GloVe vectors loaded earlier - verify if the file changes
BATCH_SIZE = 32
HIDDEN_DIM = 40  # hidden size per LSTM direction
# Pick exactly one experiment_type below; it selects the read-out used by BiLSTM.forward
#Replicate experiment with cosine distance(edited_repr, original_repr) passed to output
#experiment_type = 'cosine_distance'
#Replicate experiment with last hidden state passed to output
#experiment_type = 'insert_after'
#Replicate experiment original_repr passed to output
#experiment_type = 'original_representation'
#Replicate experiment with difference(edited_repr, original_repr) passed to output
experiment_type = 'simple_difference'
#Specify following variable False to replicate experiments with embedding finetuned
freeze_embeddings = True


model = BiLSTM(EMBEDDING_DIM, HIDDEN_DIM, INPUT_DIM, BATCH_SIZE, device, experiment_type)
print("Model initialised.")

model.to(device)
# We provide the model with our embeddings
model.embedding.weight.data.copy_(torch.from_numpy(wvecs))
# Frozen embeddings receive no gradient updates during training
if(freeze_embeddings):
    model.embedding.weight.requires_grad = False

feature = vectorized_seqs

# 'feature' is a list of lists, each containing embedding IDs for word tokens
train_and_dev = Task1Dataset(feature, corpus_original_indices, corpus_edit_indices, train_df['meanGrade'])

# Split sizes: train_proportion of the examples for training, the rest for dev
train_examples = round(len(train_and_dev)*train_proportion)
dev_examples = len(train_and_dev) - train_examples

train_dataset, dev_dataset = random_split(train_and_dev,
                                           (train_examples,
                                            dev_examples))


# Only the training loader shuffles; both pad their batches via collate_fn_padd
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

print("Dataloaders created.")

Model initialised.
Dataloaders created.


In [26]:
# Mean squared error objective; train() and eval() read `loss_fn` as a global
loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

# Adam with its default settings; train() reads `optimizer` as a global
optimizer = torch.optim.Adam(model.parameters())

# Returns one train RMSE and one validation RMSE per epoch
train_RMSEs, val_RMSEs = train(train_loader, dev_loader, model, epochs)

Training model.
| Epoch: 01 | Train Loss: 0.46 | Train MSE: 0.46 | Train RMSE: 0.68 |         Val. Loss: 0.35 | Val. MSE: 0.35 |  Val. RMSE: 0.59 |
| Epoch: 02 | Train Loss: 0.32 | Train MSE: 0.32 | Train RMSE: 0.56 |         Val. Loss: 0.34 | Val. MSE: 0.34 |  Val. RMSE: 0.58 |
| Epoch: 03 | Train Loss: 0.29 | Train MSE: 0.29 | Train RMSE: 0.54 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.57 |
| Epoch: 04 | Train Loss: 0.27 | Train MSE: 0.27 | Train RMSE: 0.52 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.57 |
| Epoch: 05 | Train Loss: 0.24 | Train MSE: 0.24 | Train RMSE: 0.49 |         Val. Loss: 0.33 | Val. MSE: 0.33 |  Val. RMSE: 0.57 |


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cd0ae3bf-8b9c-46ca-9eb4-5ef4b3522174' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>