<a href="https://colab.research.google.com/github/XiangdiChai/nlp-cw-code-repo/blob/master/approach2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, random_split
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import codecs

In [None]:
# Load data
# Feel free to edit the paths; as written, all four files are looked up
# relative to the current working directory.
# Each file is read by its own statement, so a failure on one of them leaves
# the frames that were already loaded available.
train_df = pd.read_csv('train.csv')  # read later via its 'original', 'edit' and 'meanGrade' columns
dev_df = pd.read_csv('dev.csv')  # same columns, used for validation
test_df = pd.read_csv('test.csv')  # read later via its 'id', 'original' and 'edit' columns
news_df = pd.read_csv('abcnews-date-text.csv')  # extra headlines; only 'headline_text' is read

In [None]:
# How many full passes over the training data the training loop performs.
epochs = 10

# Prefer the first CUDA device when one is usable, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch.

    Relies on the module-level `optimizer`, `loss_fn` and `device`. Whenever the
    dev loss improves on the best value seen so far, the model weights are
    written to './sample_data/BiLSTM_model_1.pth'.

    Args:
        train_iter: iterable yielding (feature, target) tensor batches.
        dev_iter: iterable of batches handed to eval() after every epoch.
        model: network exposing `batch_size`, `hidden` and `init_hidden()`.
        number_epoch: number of passes over `train_iter`.
    """

    print("Training model.")
    # Start from +inf so the first epoch always produces a checkpoint, no
    # matter how large the dev loss is (a fixed threshold could leave no
    # saved model at all).
    best_loss = float('inf')
    for epoch in range(1, number_epoch+1):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:

            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            # for RNN: the last batch can be smaller, so the model's batch size
            # and hidden state are reset from the actual batch before each call.
            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            model.hidden = model.init_hidden()
            predictions = model(feature).squeeze(1)

            optimizer.zero_grad()

            loss = loss_fn(predictions, target)

            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            # loss.item() is weighted by the batch size so that the division by
            # the observation count below yields a per-observation average.
            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse

        valid_loss, valid_mse, __, __ = eval(dev_iter, model)

        # Keep only the weights of the best epoch on the dev set.
        if valid_loss < best_loss:
            best_loss = valid_loss

            torch.save(model.state_dict(),  './sample_data/BiLSTM_model_1.pth')

            print('model save ', best_loss)

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | '
              f'        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.4f} |')

In [None]:
# Evaluation pass used after every training epoch
def eval(data_iter, model):
    """
    Run the model over `data_iter` without gradient tracking.

    Uses the module-level `loss_fn` and `device`. Returns a 4-tuple: the loss
    averaged per observation, the squared error averaged per observation, and
    numpy arrays holding all predictions and all targets.
    """
    model.eval()
    total_loss = 0
    total_sse = 0
    seen = 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for feature, target in data_iter:
            feature = feature.to(device)
            target = target.to(device)
            rows = target.shape[0]

            # for RNN: size the hidden state to the batch that is actually fed in
            model.batch_size = rows
            seen += rows
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)
            batch_loss = loss_fn(predictions, target)

            # Squared error is computed on CPU numpy copies of the tensors
            pred_np = predictions.detach().cpu().numpy()
            trg_np = target.detach().cpu().numpy()
            batch_sse, _ = model_performance(pred_np, trg_np)

            total_loss += batch_loss.item() * rows
            total_sse += batch_sse
            collected_preds.extend(pred_np)
            collected_targets.extend(trg_np)

    mean_loss = total_loss / seen
    mean_sq_error = total_sse / seen
    return mean_loss, mean_sq_error, np.array(collected_preds), np.array(collected_targets)

In [None]:
# Error metrics for one batch of predictions
def model_performance(output, target, print_output=False):
    """
    Compute the sum and the mean of the squared errors between `output` and
    `target`; when `print_output` is true, also print the MSE and the RMSE.

    Returns the pair (sse, mse).
    """
    squared_residuals = (output - target) ** 2

    sse, mse = np.sum(squared_residuals), np.mean(squared_residuals)
    rmse = np.sqrt(mse)

    if not print_output:
        return sse, mse

    print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')
    return sse, mse

In [None]:
import string
import re


def create_vocab(data):
    """
    Creating a corpus of all the tokens used.

    Every sentence is lower-cased, stripped of ASCII punctuation and of digit
    runs, then split on single spaces (so consecutive spaces produce empty
    tokens, which are kept).

    Args:
        data: iterable of sentence strings.

    Returns:
        (vocabulary, tokenized_corpus): `vocabulary` lists each distinct token
        once, in order of first appearance; `tokenized_corpus` holds one token
        list per input sentence.
    """
    # Loop invariants: build the translation table and the regex only once.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    digit_runs = re.compile(r'\d+')

    tokenized_corpus = []  # Let us put the tokenized corpus in a list
    vocabulary = []
    # Companion set for O(1) membership tests; scanning the list for every
    # token is quadratic and becomes very slow on a large corpus.
    seen_tokens = set()

    for sentence in data:
        cleaned = sentence.lower().translate(strip_punctuation)
        cleaned = digit_runs.sub('', cleaned)

        tokenized_sentence = cleaned.split(' ')
        tokenized_corpus.append(tokenized_sentence)

        for token in tokenized_sentence:
            if token not in seen_tokens:
                seen_tokens.add(token)
                vocabulary.append(token)

    return vocabulary, tokenized_corpus

In [None]:
def collate_fn_padd(batch):
    '''
    Collate a list of (feature, label) pairs into two tensors: the features
    right-padded with zeros to the longest sequence in the batch (long
    dtype), and the labels as a float tensor.
    '''
    sequences = [feat for feat, _ in batch]
    labels = [lab for _, lab in batch]
    lengths = [len(feat) for feat, _ in batch]

    # One row per sample, wide enough for the longest sequence; zero-filled.
    padded = torch.zeros((len(batch), max(lengths))).long()

    for row, seq in enumerate(sequences):
        padded[row, :lengths[row]] = torch.LongTensor(seq)

    return padded, torch.FloatTensor(labels)

class Task1Dataset(Dataset):
    """Dataset that pairs each feature sequence with the label at the same key."""

    def __init__(self, train_data, labels):
        # Both containers are stored as given; they are only indexed later.
        self.x_train, self.y_train = train_data, labels

    def __len__(self):
        # The size is taken from the labels container.
        return len(self.y_train)

    def __getitem__(self, item):
        sample = self.x_train[item]
        label = self.y_train[item]
        return sample, label

In [None]:
class BiLSTM(nn.Module):
    """
    Embedding -> bidirectional LSTM -> small feed-forward head producing one
    value per sequence.

    The caller is expected to set `batch_size` and refresh `hidden` (via
    `init_hidden()`) before a forward pass whenever the batch size changes.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Word embeddings in, hidden states of size hidden_dim per direction out.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # Head mapping the concatenated forward/backward state (2 * hidden_dim)
        # down to a single output value.
        half_dim = hidden_dim // 2
        head_layers = [
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, half_dim),
            nn.LeakyReLU(),
            nn.Linear(half_dim, 1),
            nn.LeakyReLU(),
        ]
        self.hidden2label = nn.Sequential(*head_layers)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Fresh all-zero (h_0, c_0) pair. Axes are
        # (num_layers * num_directions, minibatch_size, hidden_dim); the leading
        # 2 is one layer times two directions.
        state_shape = (2, self.batch_size, self.hidden_dim)
        h_0 = torch.zeros(*state_shape).to(self.device)
        c_0 = torch.zeros(*state_shape).to(self.device)
        return h_0, c_0

    def forward(self, sentence):
        # Swap the first two axes of the embedded batch so that time comes
        # first, which is the layout the LSTM is fed with here.
        time_major = self.embedding(sentence).permute(1, 0, 2)
        steps = len(time_major)

        lstm_input = time_major.view(steps, self.batch_size, self.embedding_dim)
        lstm_out, self.hidden = self.lstm(lstm_input, self.hidden)

        # Only the output at the last time step is passed to the head.
        return self.hidden2label(lstm_out[-1])

In [None]:

def _replace_marked_word(original, edit):
    """Return `original` with its "<...>" span replaced by `edit`."""
    marked = original[original.find("<"):original.find(">") + 1]
    return original.replace(marked, edit)


def _original_plus_edited(original, edit):
    """Return the original headline followed by its edited version."""
    return original + ' ' + _replace_marked_word(original, edit)


def _append_edit(original, edit):
    """Return the original headline with the edit word appended at the end."""
    return original + ' ' + edit


def _insert_edit_after_marked(original, edit):
    """Return `original` with `edit` inserted right after the marked word."""
    # The slice stops before ">" so the edit lands inside the marker; the
    # marker characters are punctuation and are stripped during tokenisation.
    word = original[original.find("<"):original.find(">")]
    return original.replace(word, word + ' ' + edit)


# Maps the `method` argument of pre_processing to the per-row combiner.
_PRE_PROCESSING_METHODS = {
    1: _replace_marked_word,       # replace the word in the bracket with the edit word
    2: _original_plus_edited,      # show both original and edited sentence
    3: _append_edit,               # append the edit word at the end
    4: _insert_edit_after_marked,  # append the edit word after the replaced word
}


def pre_processing(method):
    """
    Build the model input sentences for the train, dev and test sets.

    Reads the 'original' and 'edit' columns of the module-level `train_df`,
    `dev_df` and `test_df` and combines them row by row:

        1 -- replace the word in the bracket with the edit word
        2 -- original sentence followed by the edited sentence
        3 -- original sentence followed by the edit word
        4 -- edit word inserted right after the word to be replaced

    Args:
        method: one of 1, 2, 3, 4.

    Returns:
        (training_data, dev_data, test_data), three lists of strings.

    Raises:
        ValueError: if `method` is not a supported value (previously this
            silently produced three empty lists).
    """
    try:
        combine = _PRE_PROCESSING_METHODS[method]
    except (KeyError, TypeError):
        raise ValueError(
            f"unknown pre-processing method {method!r}; expected 1, 2, 3 or 4"
        ) from None

    def build(frame):
        # Iterate positionally so a non-default DataFrame index cannot break
        # the row lookup.
        return [combine(original, edit)
                for original, edit in zip(frame['original'], frame['edit'])]

    return build(train_df), build(dev_df), build(test_df)


 

In [None]:
def word_corpus(method):
    """
    Assemble the list of sentences used to train the word embedding.

    Reads the module-level `train_df`, `dev_df` and `test_df` (and `news_df`
    for methods 2 and 3):

        1 -- original sentences of the training, validation and test sets
        2 -- the above plus the additional news headlines
        3 -- the above plus the edited version of every sentence

    Args:
        method: one of 1, 2, 3.

    Returns:
        A list of sentence strings.

    Raises:
        ValueError: if `method` is not 1, 2 or 3.
    """
    if method not in (1, 2, 3):
        raise ValueError(f"unknown corpus method {method!r}; expected 1, 2 or 3")

    frames = (train_df, dev_df, test_df)
    add_data = []

    # use training, validation and test original sentence
    for frame in frames:
        add_data.extend(frame['original'])
    if method == 1:
        return add_data

    # add additional news headlines (news_df is only touched from here on)
    add_data.extend(news_df['headline_text'])
    if method == 2:
        return add_data

    # add edit sentence into the dataset: the "<...>" span of each original
    # sentence is replaced by its edit word
    for frame in frames:
        for original, edit in zip(frame['original'], frame['edit']):
            marked = original[original.find("<"):original.find(">") + 1]
            add_data.append(original.replace(marked, edit))
    return add_data






In [None]:
# Change the number passed to word_corpus(number) to experiment with a different corpus.
# Options 2 and 3 add the news headlines, so training the word embedding on them takes a while.
add_data = word_corpus(2)
# news_vocab: distinct tokens; news_tokenized_corpus: one token list per sentence.
news_vocab, news_tokenized_corpus = create_vocab(add_data)


In [None]:
# Train our own word2vec model on the tokenized corpus built above.
# min_count=0 keeps every token, however rare; vectors have 200 dimensions.
# NOTE(review): `size` and `iter` look like the pre-4.0 gensim keyword names
# (later renamed `vector_size` and `epochs`) -- confirm against the installed version.
own_embedding_model = Word2Vec(news_tokenized_corpus, min_count=0, size = 200, window = 3, iter = 30)
# Save the vectors in word2vec text format, and the full model alongside.
own_embedding_model.wv.save_word2vec_format("./own_model.txt")
own_embedding_model.save("./word2vec.model")

In [None]:
# Sanity check of the trained vectors. The query goes through the model's
# KeyedVectors (`.wv`), as the save call does, because calling most_similar
# directly on the Word2Vec model is deprecated.
print(own_embedding_model.wv.most_similar('trump'))

  """Entry point for launching an IPython kernel.


[('trumps', 0.6814748048782349), ('obama', 0.6776908040046692), ('turmp', 0.6068863868713379), ('putin', 0.5586628913879395), ('russia', 0.5517419576644897), ('obamas', 0.5415406227111816), ('assad', 0.5403751730918884), ('clinton', 0.5335540175437927), ('hillary', 0.5273513793945312), ('donald', 0.5260993242263794)]


In [None]:

training_data,dev_data,test_data = pre_processing(3)

# Creating word vectors
training_vocab, training_tokenized_corpus = create_vocab(training_data)
dev_vocab, dev_tokenized_corpus = create_vocab(dev_data)
test_vocab, test_tokenized_corpus = create_vocab(test_data)


# Creating joint vocab from test and train:
joint_vocab, joint_tokenized_corpus = create_vocab(training_data+test_data+dev_data)

print("Vocab created.")

# Set view of the joint vocabulary: membership is tested once per line of the
# embedding file, which is far too slow against a list.
joint_vocab_lookup = set(joint_vocab)

# Index 0 is reserved for padding: collate_fn_padd pads with zeros and the
# model's embedding uses padding_idx=0, so no real word may share that index.
# The token cannot collide with a corpus token because create_vocab strips
# '<' and '>' as punctuation.
PAD_TOKEN = '<pad>'
idx_2_word = {0: PAD_TOKEN}
word_2_idx = {PAD_TOKEN: 0}
word_vec = []

#Change if you have not run the training w2v code above
#with codecs.open('own_model_method2_200d.txt', 'r','utf-8') as f:
#with codecs.open('own_model_method3_200d.txt', 'r','utf-8') as f:
with codecs.open('own_model.txt', 'r','utf-8') as f:
    index = 1
    for line in f:
        fields = line.strip().split()
        # Ignore the first line - first line typically contains vocab, dimensionality
        if len(fields) > 3:
            word = fields[0]
            # Skip repeated words so indices and vector rows stay aligned.
            if word in joint_vocab_lookup and word not in word_2_idx:
                word_vec.append(list(map(float, fields[1:])))
                idx_2_word[index] = word
                word_2_idx[word] = index
                index += 1

if not word_vec:
    raise ValueError("no word of the joint vocabulary was found in the embedding file")

# All-zero vector for the padding row, sized like the loaded vectors.
word_vec.insert(0, [0.0] * len(word_vec[0]))


def _vectorize(tokenized_corpus):
    """Map each token list to known-word indices; an empty result becomes [0] (padding)."""
    sequences = [[word_2_idx[tok] for tok in seq if tok in word_2_idx] for seq in tokenized_corpus]
    return [seq if len(seq) > 0 else [0] for seq in sequences]


vectorized_seqs = _vectorize(training_tokenized_corpus)
dev_vectorized_seqs = _vectorize(dev_tokenized_corpus)
test_vectorized_seqs = _vectorize(test_tokenized_corpus)

 

Vocab created.


In [None]:
# Turn the list of embedding vectors into a 2-D array, one row per word index.
word_vec = np.asarray(word_vec)
# The embedding table has one row per entry of the word -> index mapping.
INPUT_DIM = len(word_2_idx)
# Must match the dimensionality of the vectors loaded into word_vec.
EMBEDDING_DIM = 200
BATCH_SIZE = 32

# Positional arguments: embedding_dim, hidden_dim (256), vocab_size, batch_size, device.
model = BiLSTM(EMBEDDING_DIM, 256, INPUT_DIM, BATCH_SIZE, device)
print("Model initialised.")

model.to(device)
# We provide the model with our embeddings
model.embedding.weight.data.copy_(torch.from_numpy(word_vec))


feature = vectorized_seqs
dev_feature = dev_vectorized_seqs

# 'feature' is a list of lists, each containing embedding IDs for word tokens
training = Task1Dataset(feature, train_df['meanGrade'])
dev = Task1Dataset(dev_feature, dev_df['meanGrade'])

# Only the training loader shuffles; both pad each batch through collate_fn_padd.
train_loader = torch.utils.data.DataLoader(training, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
dev_loader = torch.utils.data.DataLoader(dev, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

print("Dataloaders created.")

# loss_fn and optimizer are module-level on purpose: train() and eval() read them as globals.
loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

# Adam with its default hyper-parameters.
optimizer = torch.optim.Adam(model.parameters())

train(train_loader, dev_loader, model, epochs)

Model initialised.
Dataloaders created.
Training model.
model save  0.35392256638588815
| Epoch: 01 | Train Loss: 0.36 | Train MSE: 0.36 | Train RMSE: 0.60 |         Val. Loss: 0.35 | Val. MSE: 0.35 |  Val. RMSE: 0.5949 |
model save  0.3344501690240673
| Epoch: 02 | Train Loss: 0.35 | Train MSE: 0.35 | Train RMSE: 0.59 |         Val. Loss: 0.33 | Val. MSE: 0.33 |  Val. RMSE: 0.5783 |
model save  0.31975420928780035
| Epoch: 03 | Train Loss: 0.32 | Train MSE: 0.32 | Train RMSE: 0.57 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.5655 |
| Epoch: 04 | Train Loss: 0.25 | Train MSE: 0.25 | Train RMSE: 0.50 |         Val. Loss: 0.34 | Val. MSE: 0.34 |  Val. RMSE: 0.5856 |
| Epoch: 05 | Train Loss: 0.18 | Train MSE: 0.18 | Train RMSE: 0.42 |         Val. Loss: 0.36 | Val. MSE: 0.36 |  Val. RMSE: 0.6015 |
| Epoch: 06 | Train Loss: 0.13 | Train MSE: 0.13 | Train RMSE: 0.36 |         Val. Loss: 0.39 | Val. MSE: 0.39 |  Val. RMSE: 0.6255 |
| Epoch: 07 | Train Loss: 0.10 | Train MSE: 0

In [None]:

test_feature = test_vectorized_seqs

# The labels are never read during inference, but the dataset takes its
# length from them. Use one placeholder per test sentence so no test row is
# dropped, whatever the size of the training set.
test = Task1Dataset(test_feature, [0.0] * len(test_feature))
test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

predictions = []
best_model = BiLSTM(EMBEDDING_DIM, 256, INPUT_DIM, BATCH_SIZE, device)
# Same relative path the training loop saves to; map_location lets a
# checkpoint written on a GPU be restored on a CPU-only machine.
best_model.load_state_dict(torch.load('./sample_data/BiLSTM_model_1.pth', map_location=device))
# The inputs are moved to `device` below, so the weights must live there too.
best_model.to(device)

best_model.eval()
with torch.no_grad():
    for batch in test_loader:
        feature, target = batch
        # collate_fn_padd already yields a long tensor; only the device move is needed.
        feature = feature.to(device)
        # The last batch can be smaller: size the hidden state to this batch.
        best_model.batch_size = feature.shape[0]
        best_model.hidden = best_model.init_hidden()
        prediction = best_model(feature).squeeze(1)
        pred = prediction.detach().cpu().numpy()
        # One length-1 row per sentence, so the final array has shape (N, 1).
        predictions.extend(np.reshape(pred, (len(pred), 1)))

predictions = np.asarray(predictions)

In [None]:
import csv
out_loc = '/content/sample_data/task-1-output.csv'

test_ids = test_df['id']
# Fail loudly rather than write a file with missing or misaligned rows.
if len(predictions) != len(test_ids):
    raise ValueError(
        f"got {len(predictions)} predictions for {len(test_ids)} test ids"
    )

# newline='' is what the csv module expects for files it writes; without it
# extra blank rows can appear on platforms that translate line endings.
with open(out_loc, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(('id','pred'))
    # Pair ids and predictions by position; each prediction is a length-1 row.
    writer.writerows((row_id, pred[0]) for row_id, pred in zip(test_ids, predictions))