<a href="https://colab.research.google.com/github/XiangdiChai/nlp-cw-code-repo/blob/master/BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# You will need to download any word embeddings required for your code, e.g.:

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# For any packages that Colab does not provide auotmatically you will also need to install these below, e.g.:

#! pip install torch

--2021-02-26 15:45:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-02-26 15:45:42--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-02-26 15:45:42--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# Imports

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs

In [None]:
# Reproducibility: fix the seed of the CPU and CUDA random number generators
SEED = 1
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels
torch.backends.cudnn.deterministic = True

# Run on the first GPU when one is available, otherwise on the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

In [None]:
# Load the train / dev / test splits as DataFrames, in that order.
# Feel free to edit the paths.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv', 'test.csv')
)

In [None]:
# Share of the training data kept for training (as opposed to dev).
# NOTE(review): not referenced by any other cell visible in this notebook.
train_proportion = 0.8

# How many full passes over the training set the training loop performs
epochs = 10

In [None]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch.

    Relies on the module-level `optimizer`, `loss_fn`, `device`,
    `model_performance` and `eval` being defined before it is called.

    Args:
        train_iter: iterable of (feature, target) training batches.
        dev_iter: iterable of (feature, target) validation batches, passed to `eval`.
        model: recurrent model exposing `batch_size`, `hidden` and `init_hidden()`.
        number_epoch: number of passes over `train_iter`.

    Side effects:
        Prints one summary line per epoch and writes the model's state dict to
        './sample_data/BiLSTM_model.pth' whenever the validation loss improves.
    """
    import os

    checkpoint_path = './sample_data/BiLSTM_model.pth'
    # Make sure the checkpoint folder exists so torch.save cannot fail on a missing directory.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    print("Training model.")
    # Start from +inf so the first epoch always produces a checkpoint,
    # whatever the scale of the loss is.
    best_loss = float('inf')
    for epoch in range(1, number_epoch+1):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:

            feature, target = batch
            feature, target = feature.to(device), target.to(device)
            batch_len = target.shape[0]

            # for RNN: the hidden state must match the size of the current batch
            model.batch_size = batch_len
            no_observations = no_observations + batch_len
            model.hidden = model.init_hidden()

            optimizer.zero_grad()
            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            # loss is a batch mean; weight it by the batch size so the epoch
            # average is correct even when the last batch is smaller.
            epoch_loss += loss.item()*batch_len
            epoch_sse += sse

        valid_loss, valid_mse, __, __ = eval(dev_iter, model)

        # Keep only the weights of the best epoch on the dev set
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)
            print('model save ', best_loss)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.4f} |')

In [None]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Run the model over every batch of `data_iter` without gradient tracking.

    Uses the module-level `device`, `loss_fn` and `model_performance`.

    Returns a 4-tuple: the loss averaged per observation, the squared error
    averaged per observation, an array of all predictions and an array of all
    targets (both in iteration order).
    """
    model.eval()

    total_loss = 0
    total_sse = 0
    seen = 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for feature, target in data_iter:
            feature = feature.to(device)
            target = target.to(device)
            batch_len = target.shape[0]

            # for RNN: reset the hidden state to fit the current batch
            model.batch_size = batch_len
            seen += batch_len
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            # Move to numpy to compute the squared error of this batch
            pred = predictions.detach().cpu().numpy()
            trg = target.detach().cpu().numpy()
            sse, _ = model_performance(pred, trg)

            total_loss += loss.item() * batch_len
            total_sse += sse
            collected_preds.extend(pred)
            collected_targets.extend(trg)

    mean_loss = total_loss / seen
    mean_sq_error = total_sse / seen
    return mean_loss, mean_sq_error, np.array(collected_preds), np.array(collected_targets)

In [None]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Compute the sum and the mean of the squared errors between two arrays.

    When `print_output` is true, the MSE and RMSE are also printed.
    Returns the pair (sse, mse).
    """
    residual = output - target
    squared = residual * residual

    sse, mse = np.sum(squared), np.mean(squared)

    if print_output:
        rmse = np.sqrt(mse)
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [None]:
import string
import re


def create_vocab(data):
    """
    Tokenize a collection of sentences and collect the vocabulary they use.

    Each sentence is lower-cased, stripped of ASCII punctuation and of digit
    runs, then split on whitespace. Splitting on any whitespace run means
    that the gaps left behind by removed numbers or punctuation do not produce
    empty-string tokens.

    Args:
        data: iterable of sentence strings.

    Returns:
        (vocabulary, tokenized_corpus): `vocabulary` is the list of distinct
        tokens in order of first appearance; `tokenized_corpus` holds one
        token list per input sentence.
    """
    # Build the translation table once instead of once per sentence
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_corpus = []  # Let us put the tokenized corpus in a list
    vocabulary = []
    seen_tokens = set()    # O(1) membership test; `vocabulary` keeps the order

    for sentence in data:
        cleaned = sentence.lower()                   # lower case
        cleaned = cleaned.translate(strip_punctuation)  # remove punctuation
        cleaned = re.sub(r'\d+', '', cleaned)        # remove numbers

        tokenized_sentence = cleaned.split()

        for token in tokenized_sentence:
            if token not in seen_tokens:
                seen_tokens.add(token)
                vocabulary.append(token)

        tokenized_corpus.append(tokenized_sentence)

    return vocabulary, tokenized_corpus

In [None]:
def collate_fn_padd(batch):
    '''
    Turn a list of (token-id sequence, label) pairs into two tensors.

    Sequences are right-padded with zeros up to the longest sequence of the
    batch. Returns (LongTensor of ids, FloatTensor of labels).
    '''
    sequences = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    lengths = [len(seq) for seq in sequences]

    # Start from an all-zero matrix, then copy each sequence into its row
    padded = torch.zeros(len(batch), max(lengths), dtype=torch.long)
    for row, seq in enumerate(sequences):
        padded[row, :lengths[row]] = torch.LongTensor(seq)

    return padded, torch.FloatTensor(targets)

class Task1Dataset(Dataset):
    """
    Dataset pairing each feature sequence with its label.

    Items are always fetched by position, so a pandas Series whose index is
    not 0..n-1 (e.g. after filtering, shuffling or splitting a DataFrame) is
    handled the same way as a plain list.
    """

    def __init__(self, train_data, labels):
        # train_data: indexable collection of features, one entry per example
        self.x_train = train_data
        # labels: indexable collection of targets, aligned with train_data
        self.y_train = labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.y_train)

    @staticmethod
    def _positional(container, position):
        """Return the element at `position`, using `.iloc` for pandas objects."""
        indexer = getattr(container, 'iloc', None)
        if indexer is None:
            return container[position]
        return indexer[position]

    def __getitem__(self, item):
        """Return the (features, label) pair stored at position `item`."""
        return self._positional(self.x_train, item), self._positional(self.y_train, item)

In [None]:
class BiLSTM(nn.Module):
    """
    Bidirectional LSTM regressor over sequences of token ids.

    Token ids are embedded, encoded by a single-layer bidirectional LSTM, and
    the final hidden states of both directions are fed to a small
    feed-forward head that outputs one value per sequence.

    Args:
        embedding_dim: size of each word embedding.
        hidden_dim: size of the LSTM hidden state (per direction).
        vocab_size: number of rows in the embedding table.
        batch_size: batch size used to build the initial hidden state.
        device: device on which the hidden state tensors are created.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The head maps the concatenated forward/backward states
        # (2 * hidden_dim) down to a single output value.
        self.hidden2label = nn.Sequential(nn.Linear(hidden_dim * 2, hidden_dim),
                                          nn.LeakyReLU(),
                                          nn.Linear(hidden_dim, hidden_dim//2),
                                          nn.LeakyReLU(),
                                          nn.Linear(hidden_dim//2, 1),
                                          nn.LeakyReLU())

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """
        Return a fresh all-zero (h_0, c_0) pair for `self.batch_size` sequences.

        The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim).
        """
        state_shape = (2, self.batch_size, self.hidden_dim)
        return (torch.zeros(*state_shape).to(self.device),
                torch.zeros(*state_shape).to(self.device))

    def forward(self, sentence):
        """
        Score a batch of padded token-id sequences.

        Args:
            sentence: LongTensor of token ids, indexed as (batch, sequence position).

        Returns:
            Tensor of shape (batch, 1) with one prediction per sequence.
        """
        # If the stored state was built for another batch size (e.g. the last,
        # smaller batch of an epoch), rebuild it instead of failing.
        batch_size = sentence.shape[0]
        if self.hidden[0].shape[1] != batch_size:
            self.batch_size = batch_size
            self.hidden = self.init_hidden()

        # nn.LSTM (batch_first=False) expects (sequence, batch, embedding)
        embedded = self.embedding(sentence).permute(1, 0, 2)

        _, self.hidden = self.lstm(embedded, self.hidden)

        # For a bidirectional LSTM the output at the last time step only holds
        # the backward direction after reading a single token. The final hidden
        # states hold both directions after reading the full sequence.
        final_states = self.hidden[0]
        sentence_repr = torch.cat((final_states[0], final_states[1]), dim=1)
        return self.hidden2label(sentence_repr)

In [None]:

def _apply_edit(original, edit, method):
  """Combine one headline with its edit word according to `method` (1-4)."""
  # Method 3 does not need the marker: edit word appended at the end.
  if method == 3:
    return original + ' ' + edit

  start, end = original.find("<"), original.find(">")
  # Without a well-formed "<...>" marker the slice below would be the empty
  # string, and str.replace('', edit) would insert the edit word between
  # every character. Such headlines are left unedited instead.
  has_marker = start != -1 and end > start

  if method == 4:
    if not has_marker:
      return original
    # The slice stops before '>', so the edit word lands right after the
    # marked word and before the closing '>'.
    word = original[start:end]
    return original.replace(word, word + ' ' + edit)

  # Methods 1 and 2: substitute the whole "<...>" span with the edit word.
  edited = original.replace(original[start:end + 1], edit) if has_marker else original
  if method == 1:
    return edited
  return original + ' ' + edited


def pre_processing(method):
  """
  Build the text inputs for the train, dev and test sets.

  Reads the 'original' and 'edit' columns of the module-level `train_df`,
  `dev_df` and `test_df`.

  Args:
    method: how the edit word is combined with the original headline:
      1 - replace the word in the bracket with the edit word;
      2 - original sentence followed by the edited sentence;
      3 - original sentence followed by the edit word;
      4 - edit word inserted after the word that needs to be replaced.

  Returns:
    (training_data, dev_data, test_data): three lists of strings.

  Raises:
    ValueError: if `method` is not one of 1, 2, 3, 4.
  """
  if method not in (1, 2, 3, 4):
    raise ValueError(f'Unknown pre-processing method: {method!r} (expected 1, 2, 3 or 4)')

  def build(frame):
    # zip walks the two columns by position, independent of the frame's index
    return [_apply_edit(original, edit, method)
            for original, edit in zip(frame['original'], frame['edit'])]

  return build(train_df), build(dev_df), build(test_df)


 

## change preprocessing method


In [None]:
# Build the text inputs for the three splits.
# Method 3 appends the edit word at the end of the original sentence.
training_data, dev_data, test_data = pre_processing(3)

In [None]:
# Creating word vectors
training_vocab, training_tokenized_corpus = create_vocab(training_data)
dev_vocab, dev_tokenized_corpus = create_vocab(dev_data)
test_vocab, test_tokenized_corpus = create_vocab(test_data)


# Creating joint vocab from train, dev and test:
joint_vocab, joint_tokenized_corpus = create_vocab(training_data+dev_data+test_data)

print("Vocab created.")

# A set gives O(1) lookups; scanning the vocabulary list for each of the
# GloVe lines would be quadratic.
joint_vocab_lookup = set(joint_vocab)

# We create representations for our tokens.
# Index 0 is reserved for padding: the batches are padded with 0 and the model
# uses padding_idx=0, so no real word may share that index. Tokens never
# contain '<' or '>' (punctuation is stripped in create_vocab), so the
# placeholder cannot clash with a real token.
PAD_TOKEN = '<pad>'
wvecs = [] # word vectors
word2idx = {PAD_TOKEN: 0} # word2index
idx2word = {0: PAD_TOKEN}

# This is a large file, it will take a while to load!
# Experiment with different embedding dimensions by swapping the file name:
#   'glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt'
with codecs.open('glove.6B.200d.txt', 'r','utf-8') as f:
  index = 1
  # Stream the file line by line instead of holding all of it in memory
  for line in f:
    fields = line.strip().split()
    # Ignore the first line - first line typically contains vocab, dimensionality
    if len(fields) > 3:
      word = fields[0]
      if word in joint_vocab_lookup:
        wvecs.append(list(map(float, fields[1:])))
        word2idx[word] = index
        idx2word[index] = word
        index += 1

if not wvecs:
  raise ValueError('None of the corpus tokens were found in the embedding file.')

# Row 0 is the all-zero padding vector, rows 1.. follow the word indices
wvecs = np.array([[0.0] * len(wvecs[0])] + wvecs)


def _vectorize(tokenized_corpus):
  """Map each token list to embedding ids, skipping tokens without a vector."""
  seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in tokenized_corpus]
  # To avoid any sentences being empty (if no words match to our word
  # embeddings), fall back to a single padding id.
  return [seq if len(seq) > 0 else [0] for seq in seqs]


vectorized_seqs = _vectorize(training_tokenized_corpus)
dev_vectorized_seqs = _vectorize(dev_tokenized_corpus)
test_vectorized_seqs = _vectorize(test_tokenized_corpus)



Vocab created.


In [None]:
# Sizes of the embedding table and of the mini-batches
BATCH_SIZE = 32
EMBEDDING_DIM = 200
INPUT_DIM = len(word2idx)

model = BiLSTM(EMBEDDING_DIM, 256, INPUT_DIM, BATCH_SIZE, device)
print("Model initialised.")

model.to(device)
# Replace the freshly initialised embedding table with our word vectors
model.embedding.weight.data.copy_(torch.from_numpy(wvecs))

# Each feature is a list of embedding IDs, one per word token of a sentence
feature, dev_feature = vectorized_seqs, dev_vectorized_seqs

training = Task1Dataset(feature, train_df['meanGrade'])
dev = Task1Dataset(dev_feature, dev_df['meanGrade'])

# Only the training batches are shuffled
train_loader = torch.utils.data.DataLoader(
    training, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_padd)
dev_loader = torch.utils.data.DataLoader(
    dev, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

print("Dataloaders created.")

# Mean squared error objective, optimised with Adam
loss_fn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

train(train_loader, dev_loader, model, epochs)



Model initialised.
Dataloaders created.
Training model.
model save  0.34302689283531823
| Epoch: 01 | Train Loss: 0.37 | Train MSE: 0.37 | Train RMSE: 0.61 |         Val. Loss: 0.34 | Val. MSE: 0.34 |  Val. RMSE: 0.5857 |
model save  0.3054742734754514
| Epoch: 02 | Train Loss: 0.33 | Train MSE: 0.33 | Train RMSE: 0.57 |         Val. Loss: 0.31 | Val. MSE: 0.31 |  Val. RMSE: 0.5527 |
model save  0.29814197924473207
| Epoch: 03 | Train Loss: 0.27 | Train MSE: 0.27 | Train RMSE: 0.52 |         Val. Loss: 0.30 | Val. MSE: 0.30 |  Val. RMSE: 0.5460 |
| Epoch: 04 | Train Loss: 0.21 | Train MSE: 0.21 | Train RMSE: 0.46 |         Val. Loss: 0.33 | Val. MSE: 0.33 |  Val. RMSE: 0.5774 |
| Epoch: 05 | Train Loss: 0.18 | Train MSE: 0.18 | Train RMSE: 0.42 |         Val. Loss: 0.33 | Val. MSE: 0.33 |  Val. RMSE: 0.5710 |
| Epoch: 06 | Train Loss: 0.15 | Train MSE: 0.15 | Train RMSE: 0.38 |         Val. Loss: 0.38 | Val. MSE: 0.38 |  Val. RMSE: 0.6128 |
| Epoch: 07 | Train Loss: 0.12 | Train MSE: 0

In [None]:
# Get the predictions for the test dataset

test_feature = test_vectorized_seqs

# Labels are not used for prediction; zeros are placeholders of the right
# length so that the dataset and the collate function can be reused.
test = Task1Dataset(test_feature, [0.0] * len(test_feature))
test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

predictions = []
best_model = BiLSTM(EMBEDDING_DIM, 256, INPUT_DIM, BATCH_SIZE, device)
# Same path as the one the training loop saves to; map_location lets a
# checkpoint saved on GPU be restored on a CPU-only runtime.
best_model.load_state_dict(torch.load('./sample_data/BiLSTM_model.pth', map_location=device))
# The weights must live on the same device as the inputs and the hidden state
best_model.to(device)

best_model.eval()
with torch.no_grad():
  for batch in test_loader:
    feature, target = batch
    # collate_fn_padd already returns a LongTensor; only the device changes
    feature = feature.to(device)
    best_model.batch_size = feature.shape[0]
    best_model.hidden = best_model.init_hidden()
    prediction = best_model(feature).squeeze(1)
    pred = prediction.detach().cpu().numpy()
    # One row per example, one column: predictions end up with shape (N, 1)
    predictions.extend(pred.reshape(-1, 1))

predictions = np.asarray(predictions)

In [None]:
# Write out a csv file with one (id, prediction) row per test example
import csv
out_loc = '/content/sample_data/task-1-output.csv'

# Fail early with a clear message rather than writing a partial file
if len(predictions) != len(test_df):
    raise ValueError(
        f'Got {len(predictions)} predictions for {len(test_df)} test rows.')

# newline='' is what the csv module requires of file objects it writes to;
# without it, rows get an extra '\r' on platforms that translate newlines.
with open(out_loc, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(('id','pred'))
    # zip pairs ids and predictions by position, whatever the frame's index is
    for row_id, pred in zip(test_df['id'], predictions):
        writer.writerow((row_id, pred[0]))
