### Coursework coding instructions (please also see full coursework spec)

Please choose if you want to do either Task 1 or Task 2. You should write your report about one task only.

For the task you choose you will need to do two approaches:
  - Approach 1, which can use use pre-trained embeddings / models
  - Approach 2, which should not use any pre-trained embeddings or models
We should be able to run both approaches from the same colab file

#### Running your code:
  - Your models should run automatically when running your colab file without further intervention
  - For each task you should automatically output the performance of both models
  - Your code should automatically download any libraries required

#### Structure of your code:
  - You are expected to use the 'train', 'eval' and 'model_performance' functions, although you may edit these as required
  - Otherwise there are no restrictions on what you can do in your code

#### Documentation:
  - You are expected to produce a .README file summarising how you have approached both tasks

#### Reproducibility:
  - Your .README file should explain how to replicate the different experiments mentioned in your report

Good luck! We are really looking forward to seeing your reports and your model code!

In [22]:
# Parameters
TWITTER = True
ADD_ORIGINAL_DATA = True
REMOVE_STOPWORDS = True

In [23]:
# You will need to download any word embeddings required for your code, e.g.:

!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove.6B.zip
!wget -nc http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip -n glove.twitter.27B.zip
# For any packages that Colab does not provide auotmatically you will also need to install these below, e.g.:

#! pip install torch

File ‘glove.6B.zip’ already there; not retrieving.

Archive:  glove.6B.zip
File ‘glove.twitter.27B.zip’ already there; not retrieving.

Archive:  glove.twitter.27B.zip


In [24]:
# Imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import remove_stopwords


In [25]:
# Setting random seed and device
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print(device)

cuda:0


In [26]:
# Load data
!wget -nc https://cs.rochester.edu/u/nhossain/semeval-2020-task-7-dataset.zip
!unzip -n semeval-2020-task-7-dataset.zip

File ‘semeval-2020-task-7-dataset.zip’ already there; not retrieving.

Archive:  semeval-2020-task-7-dataset.zip


In [27]:
train_df = pd.read_csv('semeval-2020-task-7-dataset/subtask-1/train.csv')
dev_df = pd.read_csv('semeval-2020-task-7-dataset/subtask-1/dev.csv')
test_df = pd.read_csv('semeval-2020-task-7-dataset/subtask-1/test.csv')

In [28]:
# Proportion of training data for train compared to dev
train_proportion = 0.8

In [29]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch
    """
    
    print("Training model.")

    for epoch in range(1, number_epoch+1):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:

            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            # for RNN:
            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse

        valid_loss, valid_mse, __, __ = eval(dev_iter, model)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

In [30]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Evaluating model performance on the dev set
    """
    model.eval()
    epoch_loss = 0
    epoch_sse = 0
    pred_all = []
    trg_all = []
    no_observations = 0

    with torch.no_grad():
        for batch in data_iter:
            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            # for RNN:
            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            # We get the mse
            pred, trg = predictions.detach().cpu().numpy(), target.detach().cpu().numpy()
            sse, __ = model_performance(pred, trg)

            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse
            pred_all.extend(pred)
            trg_all.extend(trg)

    return epoch_loss/no_observations, epoch_sse/no_observations, np.array(pred_all), np.array(trg_all)

In [31]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (printing the MSE and the RMSE)
    """

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)
    rmse = np.sqrt(mse)

    if print_output:
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [32]:
def create_vocab(data):
    """
    Creating a corpus of all the tokens used
    """
    tokenized_corpus = [] # Let us put the tokenized corpus in a list

    for sentence in data:

        tokenized_sentence = []

        for token in sentence.split(' '): # simplest split is
            
            # Don't count empty strinng
            if token:
                tokenized_sentence.append(token)

        tokenized_corpus.append(tokenized_sentence)

    # Create single list of all vocabulary
    vocabulary = []  # Let us put all the tokens (mostly words) appearing in the vocabulary in a list
    vocab_counts = {}

    for sentence in tokenized_corpus:

        for token in sentence:

            token = token.lower()

            if token not in vocabulary:

                if True:
                    vocabulary.append(token)
                    vocab_counts[token] = 1
            else:
                vocab_counts[token] += 1

    return vocabulary, tokenized_corpus, vocab_counts

In [33]:
def collate_fn_padd(batch):
    '''
    We add padding to our minibatches and create tensors for our model
    '''

    batch_labels = [l for f, l in batch]
    batch_features = [f for f, l in batch]

    batch_features_len = [len(f) for f, l in batch]

    seq_tensor = torch.zeros((len(batch), max(batch_features_len))).long()

    for idx, (seq, seqlen) in enumerate(zip(batch_features, batch_features_len)):
        seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

    batch_labels = torch.FloatTensor(batch_labels)

    return seq_tensor, batch_labels

class Task1Dataset(Dataset):

    def __init__(self, train_data, labels):
        self.x_train = train_data
        self.y_train = labels

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, item):
        return self.x_train[item], self.y_train[item]

In [34]:
class BiLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2label = nn.Linear(hidden_dim * 2, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device), \
               torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device)

    def forward(self, sentence):
        embedded = self.embedding(sentence)
        embedded = embedded.permute(1, 0, 2)

        lstm_out, self.hidden = self.lstm(
            embedded.view(len(embedded), self.batch_size, self.embedding_dim), self.hidden)

        out = self.hidden2label(lstm_out[-1])
        return out

In [35]:
## Approach 1 code, using functions defined above:

# We set our training data and dev data
training_data = train_df['original']
dev_data = dev_df['original']

training_edit = train_df['edit']
dev_edit = dev_df['edit']

# Remove Stop words
if REMOVE_STOPWORDS:
  training_data = pd.Series([ remove_stopwords(s) for s in training_data])
  dev_data = pd.Series([remove_stopwords(s) for s in dev_data])

# Perform substitutions
training_data_original = pd.Series([re.sub('<|\/>', '', s) for s in training_data])
training_data_edit = pd.Series([re.sub('<.*\/>', e, s) for s, e in zip(training_data, training_edit)])

training_data = pd.concat([training_data_edit, training_data_original]) \
                   if ADD_ORIGINAL_DATA else training_data_edit

dev_data_original = pd.Series([re.sub('<|\/>', '', s) for s in dev_data])
dev_data_edit = pd.Series([re.sub('<.*\/>', e, s) for s, e in zip(dev_data, dev_edit)])

dev_data = pd.concat([dev_data_edit, dev_data_original]) \
                if ADD_ORIGINAL_DATA else dev_data_edit

all_data = pd.concat([training_data, dev_data])
# Order of all data = [training_data_edit, training_data_original, dev_data_edit, dev_data_original]

# Creating joint vocab from dev and train:
joint_vocab, joint_tokenized_corpus, joint_vocab_counts = create_vocab(all_data)

if ADD_ORIGINAL_DATA:
  zeroes_train = [0] * len(train_df['meanGrade'])
  zeroes_dev  = [0] * len(dev_df['meanGrade'])

grades = list(train_df['meanGrade'])     # train edit graids
if ADD_ORIGINAL_DATA:
  grades.extend(zeroes_train)            #   + train original graids
grades.extend(list(dev_df['meanGrade'])) #   + dev edit grades
if ADD_ORIGINAL_DATA:
  grades.extend(zeroes_dev)              #   + dev original graids

print("Vocab created.")
print("Vocab size: ", len(joint_vocab))
print("Data size: ", len(all_data))
print("labels size: ", len(grades))

Vocab created.
Vocab size:  12171
Data size:  24142
labels size:  24142


In [36]:
from tqdm import tqdm

EMBEDDING_DIM = 200

# We create representations for our tokens
wvecs = [[0] * EMBEDDING_DIM] # word vectors
word2idx = [('<pad>', 0)] # word2index
idx2word = [(0, '<pad>')]

glove_file = 'glove.twitter.27B.200d.txt' if TWITTER else 'glove.6B.200d.txt'

# This is a large file, it will take a while to load in the memory!
with codecs.open(glove_file, 'r','utf-8') as f:
  index = 1
  for line in tqdm(f.readlines()):
    # Ignore the first line - first line typically contains vocab, dimensionality
    if len(line.strip().split()) > 3:
      word = line.strip().split()[0]
      if word in joint_vocab:
          (word, vec) = (word,
                     list(map(float,line.strip().split()[1:])))
          wvecs.append(vec)
          word2idx.append((word, index))
          idx2word.append((index, word))
          index += 1

wvecs = np.array(wvecs)
word2idx = dict(word2idx)
idx2word = dict(idx2word)

100%|██████████| 1193515/1193515 [02:23<00:00, 8292.41it/s]


In [37]:
def check_coverage(vocab, vocab_counts, embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in vocab:
        try:
            a[word] = embeddings_index[word]
            k += vocab_counts[word]
        except:

            oov[word] = vocab_counts[word]
            i += vocab_counts[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=lambda x: x[1])[::-1]

    return sorted_x

missing_words = check_coverage(joint_vocab, joint_vocab_counts, word2idx)

print(missing_words[:10])

print(missing_words[-50:])

Found embeddings for 93.11% of vocab
Found embeddings for  97.16% of all text
[('manafort', 143), ('10', 136), ('2018', 132), ('tillerson', 124), ('2', 110), ('brexit', 110), ('2016', 102), ('5', 94), ('2017', 76), ('trump-russia', 76)]
[('vellicate', 1), ('regurgitator', 1), ('handedness', 1), ('misanthropics', 1), ('haemorrhoid', 1), ('coracobrachialis', 1), ('footrace', 1), ('catawampus', 1), ('misplaces', 1), ('defenestrates', 1), ('concocts', 1), ('remarries', 1), ('myopics', 1), ('roundness', 1), ('exhumes', 1), ('ventilates', 1), ('halloweeners', 1), ('sconces', 1), ('ventral', 1), ('scuffling', 1), ('extrapolates', 1), ('hairpieces', 1), ('coherency', 1), ('believability', 1), ('philandering', 1), ('wallpapering', 1), ('compunction', 1), ('mispronunciations', 1), ('immigrating', 1), ('transtemporalize', 1), ('bagpiping', 1), ('delousing', 1), ('chauvinists', 1), ('immolating', 1), ('stupefying', 1), ('blintz', 1), ('sashays', 1), ('reanimating', 1), ('straggled', 1), ('nightgow

In [38]:
vectorized_seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in joint_tokenized_corpus]

# To avoid any sentences being empty (if no words match to our word embeddings)
vectorized_seqs = [x if len(x) > 0 else [0] for x in vectorized_seqs]

INPUT_DIM = len(word2idx)
BATCH_SIZE = 32

# Number of epochs
epochs = 50

model = BiLSTM(EMBEDDING_DIM, 50, INPUT_DIM, BATCH_SIZE, device)
print("Model initialised.")

model.to(device)
# We provide the model with our embeddings
model.embedding.weight.data.copy_(torch.from_numpy(wvecs))

feature = vectorized_seqs

# 'feature' is a list of lists, each containing embedding IDs for word tokens
train_and_dev = Task1Dataset(feature, grades)

train_examples = round(len(train_and_dev)*train_proportion)
dev_examples = len(train_and_dev) - train_examples

train_dataset, dev_dataset = random_split(train_and_dev, (train_examples, dev_examples))

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

print("Dataloaders created.")

loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

train(train_loader, dev_loader, model, epochs)

Model initialised.
Dataloaders created.
Training model.
| Epoch: 01 | Train Loss: 0.49 | Train MSE: 0.49 | Train RMSE: 0.70 |         Val. Loss: 0.40 | Val. MSE: 0.40 |  Val. RMSE: 0.63 |
| Epoch: 02 | Train Loss: 0.39 | Train MSE: 0.39 | Train RMSE: 0.62 |         Val. Loss: 0.38 | Val. MSE: 0.38 |  Val. RMSE: 0.62 |
| Epoch: 03 | Train Loss: 0.37 | Train MSE: 0.37 | Train RMSE: 0.61 |         Val. Loss: 0.37 | Val. MSE: 0.37 |  Val. RMSE: 0.61 |
| Epoch: 04 | Train Loss: 0.36 | Train MSE: 0.36 | Train RMSE: 0.60 |         Val. Loss: 0.36 | Val. MSE: 0.36 |  Val. RMSE: 0.60 |
| Epoch: 05 | Train Loss: 0.35 | Train MSE: 0.35 | Train RMSE: 0.59 |         Val. Loss: 0.35 | Val. MSE: 0.35 |  Val. RMSE: 0.59 |
| Epoch: 06 | Train Loss: 0.34 | Train MSE: 0.34 | Train RMSE: 0.58 |         Val. Loss: 0.35 | Val. MSE: 0.35 |  Val. RMSE: 0.59 |
| Epoch: 07 | Train Loss: 0.33 | Train MSE: 0.33 | Train RMSE: 0.57 |         Val. Loss: 0.34 | Val. MSE: 0.34 |  Val. RMSE: 0.58 |
| Epoch: 08 | Train 

In [39]:
# Test set
test_data = test_df['original']
test_edit = test_df['edit']

test_data = pd.Series([remove_stopwords(s) for s in test_data])

test_data_edit = pd.Series([re.sub('<.*\/>', e, s) for s, e in zip(test_data, test_edit)])
test_vocab, test_tokenized_corpus, test_vocab_counts = create_vocab(test_data_edit)

vectorized_seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in test_tokenized_corpus]

# To avoid any sentences being empty (if no words match to our word embeddings)
vectorized_seqs = [x if len(x) > 0 else [0] for x in vectorized_seqs]

INPUT_DIM = len(word2idx)
BATCH_SIZE = 32

test_dataset = Task1Dataset(vectorized_seqs, test_df["meanGrade"])
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

loss, mse, __, __ = eval(test_loader, model)
print("Test Loss: ", loss)
print("Test MSE: ", mse)
print("Test RMSE: ", mse**0.5)

Test Loss:  0.5660502775005563
Test MSE:  0.5660502806542411
Test RMSE:  0.7523631308445684
