<a href="https://colab.research.google.com/github/XiangdiChai/nlp-cw-code-repo/blob/master/BERT_and_RoBERTa_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# You will need to download any word embeddings required for your code, e.g.:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip

# For any packages that Colab does not provide automatically you will also need to install these below, e.g.:

!pip install torch
!pip install transformers



In [None]:
# Imports packages

import torch
import torch.nn as nn
from torch.utils.data import Dataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

import pandas as pd
import numpy as np
import codecs

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import RobertaTokenizer, RobertaForSequenceClassification


In [None]:
# Setting random seed and device
SEED = 1

# Seed every random number generator used here so repeated runs are comparable.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

# Index of the GPU to use. Both the device string and the device-name lookup
# below are derived from this single value so they cannot drift apart.
cuda_dev = '0'
use_cuda = torch.cuda.is_available()
device = torch.device(f"cuda:{cuda_dev}" if use_cuda else "cpu")
print(f'Using {device}')

if use_cuda:
    print('GPU: ' + str(torch.cuda.get_device_name(int(cuda_dev))))

Using cuda:0
GPU: Tesla K80


In [None]:
# Load data
# Later cells read the 'original' and 'edit' columns from all three frames,
# and the 'meanGrade' column from the train and dev frames.
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('dev.csv') 
test_df = pd.read_csv('test.csv') 

In [None]:
# define hyperparameter

epochs = 2  # passed to train() and used to size the LR schedule (total_steps)
batch_size = 32  # batch size of the train / val / test DataLoaders
learning_rate = 3e-5  # learning rate handed to both AdamW optimizers

In [None]:
### preprocessing data


## find the start and end position of the key word in original sentence
def find_key_word(original):
  """Locate the marker characters in `original`.

  Returns a `(start, end)` tuple holding the index of the first '<' and the
  index of the first '>'. Either value is -1 when that character is absent
  (plain `str.find` semantics).
  """
  return original.find('<'), original.find('>')

## replace the word with the substitution word
def replace_key_word(original,edit):
  """Substitute the marked span of `original` with `edit`.

  The marked span runs from the first '<' through the first '>' (both
  included). Every occurrence of that exact span in `original` is replaced.

  If no well-formed marker is present (either character is missing, or '>'
  comes before '<'), `original` is returned unchanged.
  """
  start_position, end_position = find_key_word(original)

  # Without this guard the slice below would be the empty string, and
  # str.replace('', edit) inserts `edit` between every character.
  if start_position == -1 or end_position < start_position:
    return original

  marked_span = original[start_position : end_position+1]
  return original.replace(marked_span, edit)

# extract the word that will be replaced (the marked text inside '<.../>')
def extract_key_word(original):
  """Return the text inside the marker of `original`.

  The slice starts right after the first '<' and stops one character before
  the first '>', so the character immediately preceding '>' is dropped too.
  """
  open_idx, close_idx = find_key_word(original)
  return original[open_idx + 1 : close_idx - 1]

# using above functions to produce input
def preprocessing(dataset):
  """Add the derived text columns to `dataset` in place (returns None).

  Columns written:
    'replaced'   - the marked word taken from 'original'
    'context'    - 'original' with the marked span swapped for 'edit'
    'add_humour' - 'context' followed by ' [SEP] <edit> was humorous'
  """
  def marked_word(row):
    return extract_key_word(row['original'])

  def edited_sentence(row):
    return replace_key_word(row['original'], row['edit'])

  def humour_prompt(row):
    # relies on the 'context' column assigned just before this is applied
    return row['context'] + ' [SEP] ' + row['edit'] + ' was humorous'

  dataset['replaced'] = dataset.apply(marked_word, axis = 1)
  dataset['context'] = dataset.apply(edited_sentence, axis = 1)
  dataset['add_humour'] = dataset.apply(humour_prompt, axis = 1)

# preprocess our training, validation and testing data

# Each call adds the 'replaced', 'context' and 'add_humour' columns to the
# given frame in place; nothing is returned.
preprocessing(train_df)
preprocessing(val_df)
preprocessing(test_df)

In [None]:
# We define our training loop

def train(train_loader, model, epochs, val_dataloader, tokenizer, optimizer, scheduler,
          save_path='best_model.pt', eval_every=20):
    """
    Fine-tune `model` and periodically evaluate it on the validation set.

    Args:
        train_loader: DataLoader yielding (texts, labels) batches.
        model: sequence-regression model; called with `labels` so that it
            returns both `.loss` and `.logits`.
        epochs: number of passes over `train_loader`.
        val_dataloader: DataLoader passed to `eval` for validation.
        tokenizer: tokenizer forwarded to `collate_fn` and `eval`.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        save_path: file the best model `state_dict` is written to.
        eval_every: run validation (and print progress) every this many
            batches; must be a positive integer.

    Side effects:
        Prints a progress line at every validation point and overwrites
        `save_path` whenever the validation RMSE improves.
    """
    print("Training model.")
    # Start at +inf so the first validation pass always produces a checkpoint,
    # even when the validation RMSE is above 1.
    best_rmse = float('inf')

    for epoch in range(1, epochs+1):

        # number of batches processed in this epoch
        counter = 0
        model.train()

        for batch in train_loader:
            input_ids, attention_masks = collate_fn(list(batch[0]), tokenizer)
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = batch[1].to(device)
            model.zero_grad()

            # forward pass; passing labels makes the model compute the loss
            outputs = model(input_ids, attention_mask = attention_masks, labels = labels)
            loss = outputs.loss

            # backward pass with gradient clipping, then optimizer/scheduler step
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            scheduler.step()

            counter += 1

            # report the progress of the training and validation results
            if counter % eval_every == 0:
                # Flatten both arrays: the logits carry a trailing label axis,
                # and subtracting a 1-D label vector from them would broadcast
                # to a (batch, batch) matrix and distort the RMSE.
                batch_preds = outputs.logits.detach().cpu().numpy().reshape(-1)
                batch_labels = labels.detach().cpu().numpy().reshape(-1)
                _, _, rmse = model_performance(batch_preds, batch_labels)

                val_rmse, avg_val_loss = eval(val_dataloader, model, tokenizer)
                # eval() switches the model to evaluation mode; switch back so
                # dropout stays active for the remaining training batches.
                model.train()

                print(f'|Epoch: {epoch} |Batch: {counter} | Total Batch: {len(train_loader)} | Train Loss: {loss:.4f} | Train RMSE: {rmse:.4f} | Val Loss: {avg_val_loss:.4f} | Val RMSE: {val_rmse:.4f} |')
                if val_rmse < best_rmse:
                    best_rmse = val_rmse
                    torch.save(model.state_dict(), save_path)

In [None]:
# How we print the model performance

def model_performance(output, target):
    """
    Compute error metrics between predictions and targets.

    Both inputs are flattened before comparison so that, for example,
    predictions shaped (batch, 1) and targets shaped (batch,) are compared
    element by element instead of being broadcast into a (batch, batch)
    matrix.

    Args:
        output: array-like of predictions.
        target: array-like of true values with the same number of elements.

    Returns:
        (sse, mse, rmse): sum of squared errors, mean squared error and
        root mean squared error. Nothing is printed.
    """
    output = np.asarray(output).reshape(-1)
    target = np.asarray(target).reshape(-1)

    sq_error = (output - target)**2
    sse = np.sum(sq_error)
    mse = np.mean(sq_error)
    rmse = np.sqrt(mse)

    return sse, mse, rmse

In [None]:
# We evaluate performance on our dev set

def eval(val_loader, model, tokenizer):
    """
    Score `model` on every batch of `val_loader` without tracking gradients.

    The model is switched to evaluation mode and left in that mode, so a
    caller that keeps training afterwards has to call `model.train()` itself.
    Note that this function's name shadows Python's built-in `eval` in this
    notebook.

    Returns:
        (rmse, avg_loss): RMSE over all collected predictions, and the mean
        of the per-batch losses reported by the model.
    """
    model.eval()

    loss_total = 0
    predictions = np.array([])
    targets = np.array([])

    for batch in val_loader:
        texts, batch_labels = batch[0], batch[1]

        # tokenize the raw sentences and move everything to the compute device
        token_ids, masks = collate_fn(list(texts), tokenizer)
        token_ids = token_ids.to(device)
        masks = masks.to(device)
        batch_labels = batch_labels.to(device)

        with torch.no_grad():
            result = model(token_ids, attention_mask=masks, labels=batch_labels)

        loss_total += result.loss.item()

        # accumulate flattened predictions and labels across batches
        batch_preds = result.logits.detach().cpu().numpy()
        predictions = np.concatenate((predictions, np.ravel(batch_preds)))
        targets = np.concatenate((targets, np.ravel(batch_labels.cpu().numpy())))

    rmse = model_performance(predictions, targets)[2]

    return rmse, loss_total / len(val_loader)

In [None]:
# We evaluate performance on our test set

def test(final_model, test_loader, tokenizer):
  '''
  Run `final_model` over `test_loader` and collect its predictions.

  Each batch from `test_loader` is expected to contain only the input texts
  (no labels). The model is put into evaluation mode and no gradients are
  tracked. Returns a flat numpy array with the logits of all batches, in
  loader order.
  '''
  final_model.eval()
  predictions = np.array([])

  for batch in test_loader:
    # tokenize, then move the tensors to the compute device
    token_ids, masks = collate_fn(list(batch), tokenizer)
    token_ids = token_ids.to(device)
    masks = masks.to(device)

    with torch.no_grad():
        result = final_model(token_ids, attention_mask=masks)

    # flatten this batch's logits onto the running prediction array
    batch_preds = result.logits.detach().cpu().numpy()
    predictions = np.concatenate((predictions, np.ravel(batch_preds)))

  return predictions

In [None]:
## get input_id and attention_mask of each batch
def collate_fn(batch, tokenizer, max_length = 32):
  '''
  Tokenize a batch of sentences with a single tokenizer call.

  Args:
    batch: iterable of raw text strings.
    tokenizer: callable tokenizer that accepts a list of strings and returns
      a mapping with 'input_ids' and 'attention_mask' tensors.
    max_length: every sequence is truncated / padded to exactly this many
      tokens (defaults to 32).

  Returns:
    (input_ids, attention_masks): the two tensors produced by the tokenizer,
    one row per sentence in `batch`.
  '''
  # One batched call replaces tokenizing row by row and concatenating; with
  # padding='max_length' every row has the same width either way.
  encodings = tokenizer(
      list(batch),
      add_special_tokens = True,
      max_length = max_length,
      return_tensors = 'pt',
      truncation = True,
      padding = 'max_length',
  )
  return encodings['input_ids'], encodings['attention_mask']



## change dataframe to readable dataset
class Task1Dataset(Dataset):
    """
    Wrap a preprocessed DataFrame so a DataLoader can iterate over it.

    Items are `(text, label)` pairs when `training` is True and bare `text`
    values otherwise. `text` comes from the 'context' column and `label`
    from the 'meanGrade' column.
    """

    def __init__(self, df, training = True):
        # Drop the frame's own index so items are addressed purely by
        # position (0 .. len-1), which is what DataLoader samplers produce.
        # A filtered or shuffled frame would otherwise raise KeyError or
        # return the wrong row.
        self.x_train = df['context'].reset_index(drop=True)
        # alternative input: df['add_humour'] (context plus the humour prompt)
        self.training = training
        if self.training:
            # labels are only available / needed in training and validation
            self.y_train = df['meanGrade'].reset_index(drop=True)

    def __len__(self):
        return len(self.x_train)

    def __getitem__(self, item):
        text = self.x_train.iloc[item]
        if self.training:
            return text, self.y_train.iloc[item]
        return text


In [None]:
#### define our model

## model can be Bert or Roberta

## we are doing regression problem only having one label (num_labels = 1)

# for bert
# for bert
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased',  num_labels = 1)
# Use the device chosen in the setup cell instead of .cuda(), so this cell
# also runs when that cell fell back to the CPU.
model_bert.to(device)
# cast the weights to float64 (presumably to match the float64 labels read by pandas - TODO confirm)
model_bert = model_bert.double()

# for roberta
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels = 1)
model_roberta.to(device)
model_roberta = model_roberta.double()


# one optimizer per model, sharing the same learning rate
optimizer_bert = AdamW(model_bert.parameters(),lr = learning_rate, eps = 1e-8)
optimizer_roberta = AdamW(model_roberta.parameters(),lr = learning_rate, eps = 1e-8)

# construct tokenizer for bert and roberta model
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')


# construct dataset and dataloader (only the training loader is shuffled)
train_dataset = Task1Dataset(train_df)
train_loader = torch.utils.data.DataLoader(train_dataset,  shuffle = True, batch_size = batch_size)
val_dataset = Task1Dataset(val_df)
val_loader = torch.utils.data.DataLoader(val_dataset,  shuffle = False, batch_size = batch_size)
# the test set has no labels, so the dataset yields bare texts
test_dataset = Task1Dataset(test_df,False)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle = False, batch_size = batch_size)

# compute total optimizer steps: the schedulers decay the learning rate
# linearly over this many steps, with no warm-up
total_steps = len(train_loader) * epochs
scheduler_bert = get_linear_schedule_with_warmup(optimizer_bert, num_warmup_steps = 0, num_training_steps = total_steps)
scheduler_roberta = get_linear_schedule_with_warmup(optimizer_roberta, num_warmup_steps = 0, num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
## Approach 1 code, using functions defined above:
# train for roberta
# train() validates on the dev set every 20 batches and writes the weights
# with the lowest validation RMSE to 'best_model.pt'.
train(train_loader, model_roberta, epochs, val_loader, tokenizer_roberta, optimizer_roberta, scheduler_roberta)

Training model.
|Epoch: 1 |Batch: 20 | Total Batch: 302 | Train Loss: 0.5145 | Train RMSE: 0.7023 | Val Loss: 0.3438 | Val RMSE: 0.5862 |
|Epoch: 1 |Batch: 40 | Total Batch: 302 | Train Loss: 0.3550 | Train RMSE: 0.6046 | Val Loss: 0.3237 | Val RMSE: 0.5689 |
|Epoch: 1 |Batch: 60 | Total Batch: 302 | Train Loss: 0.4980 | Train RMSE: 0.7222 | Val Loss: 0.3426 | Val RMSE: 0.5848 |
|Epoch: 1 |Batch: 80 | Total Batch: 302 | Train Loss: 0.3188 | Train RMSE: 0.5811 | Val Loss: 0.3299 | Val RMSE: 0.5738 |
|Epoch: 1 |Batch: 100 | Total Batch: 302 | Train Loss: 0.4303 | Train RMSE: 0.6508 | Val Loss: 0.3054 | Val RMSE: 0.5524 |
|Epoch: 1 |Batch: 120 | Total Batch: 302 | Train Loss: 0.2720 | Train RMSE: 0.5542 | Val Loss: 0.2911 | Val RMSE: 0.5392 |
|Epoch: 1 |Batch: 140 | Total Batch: 302 | Train Loss: 0.3432 | Train RMSE: 0.6111 | Val Loss: 0.3168 | Val RMSE: 0.5621 |
|Epoch: 1 |Batch: 160 | Total Batch: 302 | Train Loss: 0.2336 | Train RMSE: 0.5654 | Val Loss: 0.2864 | Val RMSE: 0.5343 |
|Epo

In [None]:


####################### implement model on test data (need test dataset with label)

# true_df = pd.read_csv('test with label.csv') 
# model_roberta.load_state_dict(torch.load('best_model.pt'))
# prediction_roberta = test(model_roberta, test_loader, tokenizer_roberta)
# _, _, rmse_test_roberta = model_performance(prediction_roberta, np.array(true_df['meanGrade']))
# print(rmse_test_roberta)




In [None]:
# train for bert
# NOTE: train() saves to the same 'best_model.pt' path that the RoBERTa run
# wrote to, so this run can overwrite that checkpoint.
train(train_loader, model_bert, epochs, val_loader, tokenizer_bert, optimizer_bert, scheduler_bert)

Training model.
|Epoch: 1 |Batch: 20 | Total Batch: 302 | Train Loss: 0.4002 | Train RMSE: 0.6331 | Val Loss: 0.3351 | Val RMSE: 0.5785 |
|Epoch: 1 |Batch: 40 | Total Batch: 302 | Train Loss: 0.5267 | Train RMSE: 0.7263 | Val Loss: 0.3286 | Val RMSE: 0.5728 |
|Epoch: 1 |Batch: 60 | Total Batch: 302 | Train Loss: 0.2042 | Train RMSE: 0.4775 | Val Loss: 0.3257 | Val RMSE: 0.5703 |
|Epoch: 1 |Batch: 80 | Total Batch: 302 | Train Loss: 0.3630 | Train RMSE: 0.5864 | Val Loss: 0.3190 | Val RMSE: 0.5644 |
|Epoch: 1 |Batch: 100 | Total Batch: 302 | Train Loss: 0.2311 | Train RMSE: 0.5226 | Val Loss: 0.3093 | Val RMSE: 0.5557 |
|Epoch: 1 |Batch: 120 | Total Batch: 302 | Train Loss: 0.1876 | Train RMSE: 0.4921 | Val Loss: 0.2965 | Val RMSE: 0.5441 |
|Epoch: 1 |Batch: 140 | Total Batch: 302 | Train Loss: 0.1956 | Train RMSE: 0.5010 | Val Loss: 0.2928 | Val RMSE: 0.5407 |
|Epoch: 1 |Batch: 160 | Total Batch: 302 | Train Loss: 0.3536 | Train RMSE: 0.6163 | Val Loss: 0.2915 | Val RMSE: 0.5392 |
|Epo

In [None]:
####################### implement model on test data (need test dataset with label)

# true_df = pd.read_csv('test with label.csv') 
# prediction_bert = test(model_bert, test_loader, tokenizer_bert)
# _, _, rmse_test_bert = model_performance(prediction_bert, np.array(true_df['meanGrade']))
# print(rmse_test_bert)