In [None]:
# Mount Google Drive at /content/gdrive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/gdrive')
# Drive folder that train_bert() writes the trained model weights to
DIR_PATH = '/content/gdrive/MyDrive/MSc Thesis/Colab/'
# Drive folder that holds the train/validation data and label CSV files
DATA_DIR = '/content/gdrive/MyDrive/MSc Thesis/Data/'

# Hyperparameters

In [None]:
bert_model = "albert-xxlarge-v2"  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', 'bert-large-uncased'
freeze_bert = False  # if True, freeze the encoder weights and only update the classification layer weights
bs = 16  # batch size
iters_to_accumulate = 1  # gradients are accumulated over an effective batch of size bs * iters_to_accumulate; with 1 you get the usual batch size
lr = 1e-5  # learning rate
weight_decay = 1e-2  # weight decay passed to the AdamW optimizer
epochs = 5  # maximum number of training epochs
dropout_rate = 0.2  # dropout probability applied to the pooled encoder output before the classification layer
mode="individual" # 'pairs' (sentence-pair inputs) or 'individual' (one labelled sentence per example)

# Initial setup

Installing the necessary libraries that are not included in the default Colab environment

In [None]:
# Hugging Face libraries, pinned to the versions this notebook was written against
!pip install datasets==1.0.1
!pip install transformers==3.1.0
# NOTE(review): tensorboard is not version-pinned, unlike the two packages above
!pip install tensorboard

Importing libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Loading the data

The following functions load the SemEval-2020 Task 4 Subtask A data either as a sentence-pair classification task, as originally intended, or as individually labelled sentences.

In [None]:
def load_sentence_pairs(X_path, y_path):
    """
    Load the data as sentence pairs.

    Inputs:
        -X_path : path to a CSV file with the columns "id", "sent0" and "sent1"
        -y_path : path to a header-less CSV file whose second column holds the labels

    Returns a dataframe with the columns "sentence1", "sentence2" and "label".
    """
    # Sentences: drop the id column and switch to the column names used by CustomDataset
    sentences = (
        pd.read_csv(X_path)
        .drop(columns=["id"])
        .rename(columns={"sent0": "sentence1", "sent1": "sentence2"})
    )

    # Labels: the file has no header, so columns are numbered; column 0 is discarded
    labels = (
        pd.read_csv(y_path, header=None)
        .drop(columns=[0])
        .rename(columns={1: "label"})
    )

    # Rows are aligned on the index when the two frames are placed side by side
    return pd.concat([sentences, labels], axis=1)

def load_individual_sentences(X_path, y_path):
    """
    Load the data as individually labelled sentences.

    Every pair is split into two rows. The sentence whose position matches the value in the
    label file (0 -> "sent0", anything else -> "sent1") gets label 1 and the other one gets
    label 0. Pairs whose two sentences are equal when lower-cased are skipped and their row
    index is printed.

    Inputs:
        -X_path : path to a CSV file with the columns "id", "sent0" and "sent1"
        -y_path : path to a header-less CSV file whose second column holds the labels

    Returns a dataframe with the columns "sentence1" and "label".
    """
    pairs = pd.read_csv(X_path).drop(columns=["id"])
    answers = pd.read_csv(y_path, header=None).iloc[:, 1:]

    sentences = []
    labels = []

    for idx in answers.index:
        first = pairs["sent0"][idx]
        second = pairs["sent1"][idx]

        if first.lower() == second.lower():
            # Both sentences are the same: report the row and leave it out
            print(idx)
            continue

        sentences.extend([first, second])
        labels.extend([1, 0] if answers[1][idx] == 0 else [0, 1])

    return pd.DataFrame({"sentence1": sentences, "label": labels})

def MaxWords(df):
    """
    Print the largest whitespace-separated word counts found in a sentence-pair dataframe.

    Inputs:
        -df : dataframe with the string columns "sentence1" and "sentence2"

    Prints the maximum combined length of a pair and the maximum length of a single sentence.
    """
    first_lengths = [len(df["sentence1"][idx].split()) for idx in df.index]
    second_lengths = [len(df["sentence2"][idx].split()) for idx in df.index]
    pair_lengths = [a + b for a, b in zip(first_lengths, second_lengths)]

    print("Max length of sentence pairs: ", max(pair_lengths))
    print("Max length of individual sentence: ", max(first_lengths + second_lengths))

In [None]:
class CustomDataset(Dataset):
    """
    Torch dataset that tokenizes single sentences or sentence pairs stored in a dataframe.

    The dataframe must contain a "sentence1" column. If it also contains a "sentence2" column,
    each item is encoded as a sentence pair. When with_labels is True a "label" column is
    required as well.
    """

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-xxlarge-v2'):
        '''
        Inputs:
            -data : pandas dataframe holding the sentences (and optionally the labels)
            -maxlen : every encoded example is padded / truncated to this many tokens
            -with_labels : if True, __getitem__ also returns the value of the "label" column
            -bert_model : name of the pre-trained checkpoint whose tokenizer is loaded
        '''
        self.data = data  # pandas dataframe
        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)

        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        '''
        Returns (token_ids, attn_masks, token_type_ids[, label]) for the row at position `index`.
        '''
        # Rows are selected by POSITION (.iloc), not by index label: samplers hand out positions
        # 0..len-1, so this also works for dataframes without a default RangeIndex, and an
        # out-of-range position raises IndexError as the sequence protocol expects.
        texts = [str(self.data['sentence1'].iloc[index])]
        # Account for single sentence or sentence pair problems
        if 'sentence2' in self.data.columns:
            texts.append(str(self.data['sentence2'].iloc[index]))

        # Tokenize the sentence (or pair) to get token ids, attention masks and token type ids
        encoded = self.tokenizer(*texts,
                                 padding='max_length',  # Pad to max_length
                                 truncation=True,  # Truncate to max_length
                                 max_length=self.maxlen,
                                 return_tensors='pt')  # Return torch.Tensor objects

        # return_tensors='pt' adds a leading batch dimension of size 1; drop it
        token_ids = encoded['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.data['label'].iloc[index]
            return token_ids, attn_masks, token_type_ids, label
        return token_ids, attn_masks, token_type_ids

## Defining functions for training

In [None]:
class BertClassifier(nn.Module):
    """
    Pre-trained BERT-style encoder followed by dropout and a single-logit linear classifier.
    """

    def __init__(self, bert_model="albert-xxlarge-v2", freeze_bert=False, dropout_rate=0.2):
        '''
        Inputs:
            -bert_model : name of the pre-trained checkpoint to load with AutoModel
            -freeze_bert : if True, the encoder weights are frozen and only the classifier is trained
            -dropout_rate : dropout probability applied to the pooled output before the classifier
        '''
        super().__init__()
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        # Read the hidden-state size from the loaded model's config instead of a hard-coded table,
        # so checkpoints other than the six originally listed ones work as well
        hidden_size = self.bert_layer.config.hidden_size

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.dropout = nn.Dropout(p=dropout_rate)

        # Classification layer
        self.cls_layer = nn.Linear(hidden_size, 1)


    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2

        Returns the logits produced by the classification layer (one value per example).
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations.
        # Arguments are passed by keyword so they cannot be mismatched with the encoder's signature.
        outputs = self.bert_layer(input_ids=input_ids,
                                  attention_mask=attn_masks,
                                  token_type_ids=token_type_ids)

        # Integer indexing works for a plain tuple and for dict-like model outputs, whereas tuple
        # unpacking of a dict-like output would yield its keys. Element 1 is the pooled output.
        pooler_output = outputs[1]

        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.
        logits = self.cls_layer(self.dropout(pooler_output))

        return logits

In [None]:
def set_seed(seed):
    """ Seed every random number generator used in this notebook to make results reproducible """
    # Python-level sources of randomness
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU and all CUDA devices)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags: request deterministic behaviour and turn off benchmark mode
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def get_probs_from_logits(logits):
    """
    Applies the sigmoid function to a tensor of logits and returns the probabilities as a
    numpy array with one extra trailing dimension of size 1
    """
    expanded = logits.unsqueeze(-1)
    # Detach from the graph and move to the CPU before converting to numpy
    return torch.sigmoid(expanded).detach().cpu().numpy()

def evaluate_loss(net, device, criterion, dataloader):
    """
    Evaluates the model on a labelled dataloader.

    Inputs:
        -net : model called as net(token_ids, attn_masks, token_type_ids) that returns logits
        -device : device the batches are moved to before the forward pass
        -criterion : loss function called as criterion(logits, float_labels)
        -dataloader : iterable of (token_ids, attn_masks, token_type_ids, labels) batches

    Returns (mean of the per-batch losses, accuracy at a 0.5 probability threshold).

    Raises ValueError if the dataloader yields no batches.
    """
    net.eval()

    total_loss = 0.0
    num_batches = 0
    # Plain lists are used as accumulators: Series.append was removed in pandas 2.0 and
    # re-allocated the whole series on every batch
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for seq, attn_masks, token_type_ids, labels in tqdm(dataloader):
            # Converting data to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            # Passing input data through network and calculating loss from output + labels
            logits = net(seq, attn_masks, token_type_ids).squeeze(-1)
            total_loss += criterion(logits, labels.float()).item()

            # Getting predictions from outputted probabilities
            probs = get_probs_from_logits(logits).squeeze(-1)
            all_preds.extend((probs >= 0.5).astype('uint8').tolist())
            # Labels are moved to the CPU so that they can be converted to numpy
            all_labels.extend(labels.cpu().numpy().astype('uint8').tolist())

            num_batches += 1

    if num_batches == 0:
        raise ValueError("evaluate_loss received an empty dataloader")

    return total_loss / num_batches, accuracy_score(all_labels, all_preds)

In [None]:
def train_bert(net, bert_model, criterion, opti, lr, weight_decay, train_loader, val_loader, epochs, iters_to_accumulate, mode = "pairs", lr_scheduler = None):
    """
    Trains the model with mixed precision and gradient accumulation, evaluates it on the
    validation set after every epoch and stops as soon as an epoch improves neither the
    validation accuracy nor the validation loss of the previously accepted epoch. The weights
    of the last accepted epoch are saved to DIR_PATH.

    Inputs:
        -net : model to train; batches are moved to the device of its parameters
        -bert_model : name of the pre-trained checkpoint (only used in the saved file name)
        -criterion : loss function called as criterion(logits, float_labels)
        -opti : optimizer
        -lr : learning rate (only used in the saved file name)
        -weight_decay : weight decay (only used in the saved file name)
        -train_loader : dataloader yielding (token_ids, attn_masks, token_type_ids, labels)
        -val_loader : dataloader passed to evaluate_loss after every epoch
        -epochs : maximum number of training epochs
        -iters_to_accumulate : number of batches whose gradients are accumulated per optimizer step
        -mode : 'pairs' or 'individual' (only used in the saved file name)
        -lr_scheduler : optional scheduler stepped after every optimizer step

    Raises RuntimeError if no epoch was accepted, i.e. there is nothing to save.
    """

    # Use the device the model already lives on rather than relying on a notebook-level global
    device = next(net.parameters()).device

    best_loss = np.inf  # np.Inf was removed in NumPy 2.0
    best_acc = 0
    net_copy = None  # copy of the model after the last accepted epoch
    loss = None  # defined up front so the cleanup below is safe even if no batch was processed
    nb_iterations = len(train_loader)
    # Print the training loss about 5 times per epoch; at least 1 so that loaders with fewer
    # than 5 batches do not cause a modulo by zero
    print_every = max(1, nb_iterations // 5)

    scaler = GradScaler()
    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                logits = net(seq, attn_masks, token_type_ids)

                # Computing loss
                batch_loss = criterion(logits.squeeze(-1), labels.float())
                # Log training loss after every batch
                # writer.add_scalar("Train/loss", loss.item(), (it+1)+((ep)*nb_iterations))
                loss = batch_loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Backpropagating the gradients
            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                if lr_scheduler is not None:
                    # Adjust the learning rate based on the number of iterations.
                    lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            # Track the un-normalized batch loss so that the printed value does not shrink
            # with iters_to_accumulate
            running_loss += batch_loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_loss, val_acc = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}, Validation Accuracy : {}".format(ep+1, val_loss, val_acc))
        # Log validation loss and accuracy after every epoch
        # writer.add_scalar("Validation/loss", val_loss, ep+1)
        # writer.add_scalar("Validation/accuracy", val_acc, ep+1)

        # Running early stopping based on validation loss and accuracy since validation loss often seemed to get worse while validation accuracy improves in experimentation done
        if val_acc > best_acc or val_loss < best_loss:
            print("Best epoch validation accuracy changed from {} to {}".format(best_acc, val_acc))
            print("Best epoch validation loss changed from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            # Both references are overwritten with this epoch's values, so the next epoch is
            # compared against this one
            best_loss = val_loss
            best_acc = val_acc
        else:
            # Early stopping after one bad epoch (decision made based on observations from experimentation done)
            print("Stopping training due to deterioration in validation loss and accuracy from previous epoch")
            break

    if net_copy is None:
        # Happens when epochs == 0 or when the very first epoch was rejected (e.g. NaN loss)
        raise RuntimeError("No epoch produced a model to save; nothing was written to disk")

    # Saving the model
    model_name = '{}_{}_lr_{}_wd_{}.pt'.format(bert_model, mode, lr, weight_decay)
    path_to_model=DIR_PATH+model_name
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))


    # writer.close()
    del loss
    torch.cuda.empty_cache()

# Training

Training with hyperparameter tuning:

In [None]:
# set_seed(8)
# torch.cuda.empty_cache()

# # "albert-xxlarge-v2", "bert-large-uncased"
# for bert_model in ["bert-large-uncased"]:
#     # "individual", "pairs"
#     for mode in ["pairs", "individual"]:
#         # 1e-1, 1e-2
#         for weight_decay in [1e-1, 1e-2]:
#             # 5e-5, 4e-5, 3e-5, 2e-5, 1e-5, LR values from BERT paper and CN.HIT.IT-NLP paper
#             for lr in [5e-5, 4e-5, 3e-5, 2e-5,1e-5]:
#                 #
#                 if mode == "pairs":
#                     maxlen = 64  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
#                     train_df = load_sentence_pairs(DATA_DIR+'train_data.csv', DATA_DIR+'train_labels.csv')
#                     val_df = load_sentence_pairs(DATA_DIR+'val_data.csv', DATA_DIR+'val_labels.csv')
#                 elif mode == "individual":
#                     maxlen = 32  # maximum length of the tokenized input sentence : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
#                     train_df = load_individual_sentences(DATA_DIR+'train_data.csv', DATA_DIR+'train_labels.csv')
#                     val_df = load_individual_sentences(DATA_DIR+'val_data.csv', DATA_DIR+'val_labels.csv')
#                 else:
#                     print("WARNING: invalid running mode, please select 'pairs' or 'individual'")

#                 train_set = CustomDataset(train_df, maxlen, bert_model=bert_model)
#                 val_set = CustomDataset(val_df, maxlen, bert_model=bert_model)
#                 # Creating instances of training and validation dataloaders
#                 train_loader = DataLoader(train_set, batch_size=bs, num_workers=2, shuffle=True)
#                 val_loader = DataLoader(val_set, batch_size=bs, num_workers=2, shuffle=True)

#                 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#                 net = BertClassifier(bert_model, freeze_bert=False, dropout_rate=dropout_rate)

#                 net.to(device)

#                 criterion = nn.BCEWithLogitsLoss()
#                 opti = AdamW(net.parameters(), lr=lr, weight_decay=weight_decay)

#                 # num_training_steps = epochs * len(train_loader)  # The total number of training steps
#                 # Setting warmup steps to 0.2 proportion of total training steps as done by Zhang et al.
#                 # num_warmup_steps = 0.2 * num_training_steps # The number of steps for the warmup phase.
#                 # t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
#                 # lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

#                 train_bert(net, bert_model, criterion, opti, lr, weight_decay, train_loader, val_loader, epochs, iters_to_accumulate, mode)

Individual training:

In [None]:
set_seed(8)

# Load the data in the form selected by the `mode` hyperparameter
if mode == "pairs":
    maxlen = 64  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
    train_df = load_sentence_pairs(DATA_DIR+'train_data.csv', DATA_DIR+'train_labels.csv')
    val_df = load_sentence_pairs(DATA_DIR+'val_data.csv', DATA_DIR+'val_labels.csv')
elif mode == "individual":
    maxlen = 32  # maximum length of the tokenized input sentence : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
    train_df = load_individual_sentences(DATA_DIR+'train_data.csv', DATA_DIR+'train_labels.csv')
    val_df = load_individual_sentences(DATA_DIR+'val_data.csv', DATA_DIR+'val_labels.csv')
else:
    # Fail immediately: continuing would either crash on undefined names or silently reuse
    # dataframes left over from an earlier run of this cell
    raise ValueError("invalid running mode {!r}, please select 'pairs' or 'individual'".format(mode))

# bert_model must be passed by keyword: the third positional parameter of CustomDataset is
# `with_labels`, so a positional argument would leave the tokenizer on its default checkpoint
train_set = CustomDataset(train_df, maxlen, bert_model=bert_model)
val_set = CustomDataset(val_df, maxlen, bert_model=bert_model)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=2, shuffle=True)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=2, shuffle=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Honour the freeze_bert hyperparameter instead of hard-coding False
net = BertClassifier(bert_model, freeze_bert=freeze_bert, dropout_rate=dropout_rate)

net.to(device)

criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=weight_decay)

# num_training_steps = epochs * len(train_loader)  # The total number of training steps
# Setting warmup steps to 0.2 proportion of total training steps as done by Zhang et al.
# num_warmup_steps = 0.2 * num_training_steps # The number of steps for the warmup phase.
# t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
# lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, bert_model, criterion, opti, lr, weight_decay, train_loader, val_loader, epochs, iters_to_accumulate, mode)