In [1]:
from google.colab import drive

# Make the Google Drive files (data and trained weights) visible to this runtime.
drive.mount('/content/gdrive')

# --- Run configuration ---
# "individual" scores every sentence on its own; "pairs" scores sentence pairs.
mode = "individual"
# Number of examples per batch when iterating over the test set.
bs = 16
# Pre-trained encoder checkpoint. Alternatives:
# 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', 'bert-large-uncased' ...
bert_model = "albert-xxlarge-v2"

# --- Locations on the mounted drive ---
DATA_DIR = '/content/gdrive/MyDrive/MSc Thesis/Data/'
path_to_model = '/content/gdrive/MyDrive/MSc Thesis/Colab/models/Best individual model.pt'

Mounted at /content/gdrive


## Initial setup

Installing the required libraries that are not included in the default Colab environment.

In [2]:
!pip install transformers==3.1.0

Collecting transformers==3.1.0
  Downloading transformers-3.1.0-py3-none-any.whl (884 kB)
[K     |████████████████████████████████| 884 kB 3.8 MB/s 
Collecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 80.7 MB/s 
[?25hCollecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 76.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 72.1 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.45 sentencepiece-0.1.96 tokenizers-0.8.1rc2 transformers-3.1.0


Importing libraries

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably this avoids the tokenizers fork warning/deadlock when the
# DataLoader spawns worker processes — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Loading the data

The following functions load the SemEval-2020 Task 4 Subtask A data in one of two forms: as a sentence-pair classification task, as originally intended, or as individually labelled sentences.

In [4]:
def load_sentence_pairs(X_path, y_path):
    """
    Load the data as a sentence-pair classification problem.

    Inputs:
        -X_path : CSV with a header row and the columns "id", "sent0" and "sent1"
        -y_path : header-less CSV whose first column is dropped and whose second column is the label

    Returns a DataFrame with the columns "sentence1", "sentence2" and "label".
    Rows are aligned by position; pairs whose two sentences are identical are NOT filtered out.
    """
    sentences = pd.read_csv(X_path).drop(columns=["id"])
    labels = pd.read_csv(y_path, header=None).drop(columns=[0])

    # Bring both frames to the column names used by the rest of the notebook
    sentences = sentences.rename(columns={"sent0": "sentence1", "sent1": "sentence2"})
    labels = labels.rename(columns={1: "label"})

    return pd.concat([sentences, labels], axis=1)

def load_individual_sentences(X_path, y_path):
    """
    Load the data as individually labelled sentences.

    Every pair is split into two rows. When the pair's label is 0, the first sentence
    gets label 1 and the second label 0; otherwise the first gets 0 and the second 1.
    Pairs whose sentences are equal (ignoring case) are skipped and their row index is printed.

    Inputs:
        -X_path : CSV with a header row and the columns "id", "sent0" and "sent1"
        -y_path : header-less CSV whose second column is the pair label

    Returns a DataFrame with the columns "sentence1" and "label".
    """
    sentences = pd.read_csv(X_path).drop(columns=["id"])
    pair_labels = pd.read_csv(y_path, header=None).iloc[:, 1:]

    texts, targets = [], []

    for idx in pair_labels.index:
        first = sentences["sent0"][idx]
        second = sentences["sent1"][idx]

        if first.lower() == second.lower():
            # Identical pair: nothing to learn from it, just report where it was
            print(idx)
            continue

        texts.extend([first, second])
        targets.extend([1, 0] if pair_labels[1][idx] == 0 else [0, 1])

    return pd.DataFrame({"sentence1": texts, "label": targets})

In [5]:
class CustomDataset(Dataset):
    """
    Torch dataset that tokenizes one sentence (or one sentence pair) per item.

    Inputs:
        -data : pandas DataFrame with a 'sentence1' column, optionally a 'sentence2'
                column (sentence-pair mode) and, if with_labels is True, a 'label' column
        -maxlen : every example is padded / truncated to this many tokens
        -with_labels : whether __getitem__ also returns the label
        -bert_model : name of the pre-trained checkpoint whose tokenizer is loaded
    """

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-xxlarge-v2'):

        self.data = data  # pandas dataframe
        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)

        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """
        Returns (token_ids, attn_masks, token_type_ids[, label]) for the row at POSITION index.
        """
        # Rows are addressed by position (.iloc), not by index label: a DataLoader hands out
        # positions 0..len-1, which only coincide with the labels for a pristine RangeIndex
        # (e.g. not after rows have been filtered out of the frame).
        sentences = [str(self.data['sentence1'].iloc[index])]
        # Account for single sentence or sentence pair problems
        if 'sentence2' in self.data.columns:
            sentences.append(str(self.data['sentence2'].iloc[index]))

        # Tokenize the sentence (or the pair) to get token ids, attention masks and token type ids
        encoded = self.tokenizer(*sentences,
                                 padding='max_length',  # Pad to max_length
                                 truncation=True,  # Truncate to max_length
                                 max_length=self.maxlen,
                                 return_tensors='pt')  # Return torch.Tensor objects

        # The tokenizer returns tensors with a leading batch dimension of 1; drop it
        token_ids = encoded['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.data['label'].iloc[index]
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids

## Defining model class and function for converting model outputs to probabilities


In [6]:
class BertClassifier(nn.Module):
    """
    Pre-trained BERT-style encoder followed by dropout and a single-logit linear classifier.

    Inputs:
        -bert_model : name of the pre-trained checkpoint passed to AutoModel.from_pretrained
        -freeze_bert : if True, the encoder weights are frozen and only the classifier is trainable
        -dropout_rate : dropout probability applied to the pooled output before the classifier
    """

    def __init__(self, bert_model="albert-xxlarge-v2", freeze_bert=False, dropout_rate=0.2):
        super(BertClassifier, self).__init__()
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model)

        #  Read the encoder output size from the loaded model's config instead of a
        #  hard-coded per-checkpoint table, so any checkpoint works (a name missing from
        #  the table used to crash with an unbound `hidden_size`).
        hidden_size = self.bert_layer.config.hidden_size

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.dropout = nn.Dropout(p=dropout_rate)

        # Classification layer
        self.cls_layer = nn.Linear(hidden_size, 1)

        # Activation function. Not applied in forward(), which returns raw logits;
        # kept so the module's attributes stay as they were.
        self.activation = nn.Sigmoid()


    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2

        Returns the raw (pre-sigmoid) logits produced by the classification layer.
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations.
        # Keyword arguments make the mapping explicit, and the result is indexed rather than
        # unpacked so that both a plain tuple and a dict-like model output are handled.
        outputs = self.bert_layer(input_ids, attention_mask=attn_masks, token_type_ids=token_type_ids)
        pooler_output = outputs[1]

        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.
        logits = self.cls_layer(self.dropout(pooler_output))

        return logits

In [7]:
def set_seed(seed):
    """
    Seed every random number generator used by the notebook (PyTorch CPU and CUDA,
    NumPy, Python's `random`), switch cuDNN to deterministic mode and export
    PYTHONHASHSEED, so that repeated runs produce the same results.
    """
    # PyTorch generators: CPU first, then every CUDA device
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)

    # Deterministic cuDNN kernels, no auto-tuner picking algorithms per run
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NumPy and the standard library
    for seed_other in (np.random.seed, random.seed):
        seed_other(seed)

    os.environ['PYTHONHASHSEED'] = f"{seed}"

def get_probs_from_logits(logits):
    """
    Applies the sigmoid function to a tensor of logits and returns the probabilities
    as a NumPy array on the CPU.

    Note: a trailing dimension of size 1 is appended, so logits of shape (N,) yield an
    array of shape (N, 1).
    """
    expanded = logits.unsqueeze(-1)
    probabilities = expanded.sigmoid()
    return probabilities.detach().cpu().numpy()

## Testing and Evaluation

In [8]:
def evaluate_test_set(net, device, dataloader):
    """
    Runs the model over a labelled data loader and computes classification metrics.

    Inputs:
        -net : model called as net(token_ids, attn_masks, token_type_ids), returning one logit per example
        -device : torch device the input batches are moved to
        -dataloader : iterable of (token_ids, attn_masks, token_type_ids, labels) batches

    Returns a dict with the keys "accuracy", "precision", "recall", "f1" and "AUROC".
    An example is predicted as class 1 when its probability is >= 0.5.
    """
    net.eval()

    # Plain lists are filled batch by batch and converted once at the end. This replaces
    # repeated pd.Series.append calls, which copy the whole series on every batch and are
    # no longer available in pandas 2.x.
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for seq, attn_masks, token_type_ids, labels in tqdm(dataloader):
            # Moving the model inputs to the target device (labels are only compared on the CPU)
            seq, attn_masks, token_type_ids = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device)

            # Getting probabilities from the outputted logits
            logits = net(seq, attn_masks, token_type_ids)
            probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
            all_probs.extend(probs.tolist())
            all_labels.extend(torch.as_tensor(labels).tolist())

    all_probs = np.asarray(all_probs, dtype='float64')
    all_labels = np.asarray(all_labels).astype('uint8')
    all_preds = (all_probs >= 0.5).astype('uint8')

    return {"accuracy": accuracy_score(all_labels, all_preds), "precision": precision_score(all_labels, all_preds),
            "recall": recall_score(all_labels, all_preds), "f1": f1_score(all_labels, all_preds),
            "AUROC": roc_auc_score(all_labels, all_probs)}

In [9]:
def evaluate_individual_sentence(net, device, tokenizer, sent, maxlen=32):
    """
    Classifies a single sentence and prints the prediction with its probability.

    Inputs:
        -net : model called as net(token_ids, attn_masks, token_type_ids), returning one logit
        -device : torch device the encoded sentence is moved to
        -tokenizer : tokenizer matching the model's pre-trained checkpoint
        -sent : the sentence to classify
        -maxlen : length the sentence is padded / truncated to (default 32)

    A probability above 0.5 is reported as "Nonsensical", below 0.5 as "Sensical" and
    exactly 0.5 as "Uncertain". Nothing is returned.
    """
    # The best performing model on the test set with individual sentences was the model
    # from the initial experimentation (before hyperparameter tuning)

    # Make sure dropout is switched off even if no other evaluation ran before this call
    net.eval()

    with torch.no_grad():
        encoded = tokenizer(sent, padding='max_length', truncation=True, max_length=maxlen, return_tensors='pt')
        token_ids = encoded['input_ids'].to(device)
        attn_masks = encoded['attention_mask'].to(device)
        token_type_ids = encoded['token_type_ids'].to(device)
        logits = net(token_ids, attn_masks, token_type_ids)
        probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)

    print("Input sentence:", sent)
    if probs[0] > 0.5:
        print("Prediction: Nonsensical")
        print("Prediction probability:", round(probs[0]*100, 2))
    elif probs[0] < 0.5:
        print("Prediction: Sensical")
        print("Prediction probability:", round((1-probs[0])*100, 2))
    else:
        print("Prediction: Uncertain")
    print("")


In [10]:
# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = BertClassifier(bert_model)
# map_location remaps the stored tensors onto `device`, so a checkpoint that was saved
# on a GPU can still be loaded when the runtime only has a CPU.
net.load_state_dict(torch.load(path_to_model, map_location=device))
net.to(device)
# This notebook only runs inference: switch off dropout from the start
net.eval()

# Tokenizer matching the encoder checkpoint, used for single-sentence predictions
tokenizer = AutoTokenizer.from_pretrained(bert_model)

Downloading:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/893M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

In [11]:
set_seed(8)

if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Pick the loader and the maximum sequence length that match the running mode
if mode == "pairs":
    maxlen = 64
    test_df = load_sentence_pairs(DATA_DIR+'test_data.csv', DATA_DIR+'test_labels.csv')
elif mode == "individual":
    maxlen = 32
    test_df = load_individual_sentences(DATA_DIR+'test_data.csv', DATA_DIR+'test_labels.csv')
else:
    # Fail here with a clear message instead of printing a warning and crashing
    # on the undefined `test_df` a few lines further down.
    raise ValueError(f"invalid running mode {mode!r}, please select 'pairs' or 'individual'")

# bert_model has to be passed by keyword: the third positional parameter of
# CustomDataset is `with_labels`, not the checkpoint name.
test_set = CustomDataset(test_df, maxlen, bert_model=bert_model)
# The metrics do not depend on the order of the examples, so the test set is not shuffled
test_loader = DataLoader(test_set, batch_size=bs, num_workers=2, shuffle=False)

results = evaluate_test_set(net, device, test_loader)
print('')
print(results)

100%|██████████| 125/125 [00:57<00:00,  2.19it/s]


{'accuracy': 0.8925, 'precision': 0.9234088457389428, 'recall': 0.856, 'f1': 0.888427607680332, 'AUROC': 0.956616}





In [12]:
# Spot-check the loaded model on a single hand-written sentence
evaluate_individual_sentence(net, device, tokenizer, "The sky is blue")

Input sentence: The sky is blue
Prediction: Sensical
Prediction probability: 98.09

