# Top

In [None]:
# Colab setup: install Hugging Face transformers into the runtime.
# The install is unpinned; the captured output below shows transformers-4.6.1 being downloaded.
!pip install transformers
# Mount Google Drive so the dataset and checkpoint folders under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |▏                               | 10kB 24.5MB/s eta 0:00:01[K     |▎                               | 20kB 31.6MB/s eta 0:00:01[K     |▍                               | 30kB 37.0MB/s eta 0:00:01[K     |▋                               | 40kB 29.2MB/s eta 0:00:01[K     |▊                               | 51kB 25.4MB/s eta 0:00:01[K     |▉                               | 61kB 27.4MB/s eta 0:00:01[K     |█                               | 71kB 24.2MB/s eta 0:00:01[K     |█▏                              | 81kB 25.2MB/s eta 0:00:01[K     |█▎                              | 92kB 24.1MB/s eta 0:00:01[K     |█▌                              | 102kB 25.4MB/s eta 0:00:01[K     |█▋                              | 112kB 25.4MB/s eta 0:00:01[K     |█▊                              | 

In [None]:
import os
import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from transformers import RobertaModel, RobertaTokenizer, AdamW

from tqdm.auto import tqdm

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import random
import numpy as np

# Project root on the mounted Google Drive; every dataset and checkpoint path is
# built relative to this folder.
cur_dir = "/content/drive/MyDrive/Colab_Notebooks/NLP_244_Advanced_ML/final_project_socialiqa/socialiqa-nlp244"

# Sub-folders (relative to cur_dir) holding the train/dev splits of each dataset.
data_dir_siqa = "socialiqa-train-dev"
data_dir_hs = "hellaswag-train-dev"
data_dir_anli = "alphanli-train-dev"

# Sub-folder (relative to cur_dir) where model checkpoints are written.
out_dir = "out"

# One seed shared by all random number generators used in the notebook.
SEED = 27

np.random.seed(SEED)
# np.random.seed() returns None, so the seed value is stored explicitly:
# np_seed holds the seed that was applied to NumPy.
np_seed = SEED
torch.manual_seed(SEED)
random.seed(SEED)

# Data

## Extract aNLI data

Getting data and corresponding labels from dev and train split


In [None]:
# Paths of the aNLI train/dev splits (JSON-lines files).
file_train_anli = os.path.join(cur_dir, data_dir_anli, "train.jsonl")
file_dev_anli = os.path.join(cur_dir, data_dir_anli, "dev.jsonl")

# Parse each split into a DataFrame, one example per row.
json_train_anli = pd.read_json(file_train_anli, lines=True)
json_dev_anli = pd.read_json(file_dev_anli, lines=True)

# Every example becomes a tuple (obs1, obs2, hyp1, hyp2).
train_data_anli = list(zip(
    json_train_anli['obs1'].tolist(),
    json_train_anli['obs2'].tolist(),
    json_train_anli['hyp1'].tolist(),
    json_train_anli['hyp2'].tolist(),
))
dev_data_anli = list(zip(
    json_dev_anli['obs1'].tolist(),
    json_dev_anli['obs2'].tolist(),
    json_dev_anli['hyp1'].tolist(),
    json_dev_anli['hyp2'].tolist(),
))

print(len(train_data_anli), len(dev_data_anli))

# The label files store one integer per line; subtracting 1 turns the stored
# values into the 0/1 indices used for training.
with open(os.path.join(cur_dir, data_dir_anli, "train-labels.lst")) as f:
    train_labels_anli = [int(line.split()[0]) - 1 for line in f]

with open(os.path.join(cur_dir, data_dir_anli, "dev-labels.lst")) as f:
    dev_labels_anli = [int(line.split()[0]) - 1 for line in f]

print(len(train_labels_anli), len(dev_labels_anli))

169654 1532
169654 1532


In [None]:
class aNliDataset(Dataset):
    """
    Dataset wrapper that turns aNLI examples into tokenizer encodings.

    Each example yields two candidate sequences, one per hypothesis. Both are
    tokenized together here, so they are padded to a common length within the
    example; prepare_batch_MC pads across the examples of a batch.
    """
    def __init__(self, tokenizer, x, y):
        # x: list of (obs1, obs2, hyp1, hyp2) tuples
        # y: list with the index of the correct hypothesis for each example
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        example = self.x[idx]
        # First segment: obs1 <sep><sep> hypothesis; second segment: obs2.
        double_sep = self.roberta_tokenizer.sep_token + self.roberta_tokenizer.sep_token
        first_segments = [example[0] + double_sep + example[hyp_pos] for hyp_pos in (2, 3)]
        second_segments = [example[1], example[1]]
        encoding = self.roberta_tokenizer(
            first_segments,
            second_segments,
            return_tensors='pt',
            padding=True,
        )
        return (encoding, self.y[idx])


## Extract HellaSwag data

Getting data and corresponding labels from dev and train split

In [None]:
# Paths of the HellaSwag train/validation splits (JSON-lines files).
file_train_hs = os.path.join(cur_dir, data_dir_hs, "train.jsonl")
file_dev_hs = os.path.join(cur_dir, data_dir_hs, "valid.jsonl")

# Parse each split into a DataFrame, one example per row.
json_train_hs = pd.read_json(file_train_hs, lines=True)
json_dev_hs = pd.read_json(file_dev_hs, lines=True)

# Every example becomes a tuple (context, [ending option 1, ..., ending option 4]).
train_data_hs = list(zip(
    json_train_hs['ctx'].tolist(),
    json_train_hs['ending_options'].tolist(),
))
dev_data_hs = list(zip(
    json_dev_hs['ctx'].tolist(),
    json_dev_hs['ending_options'].tolist(),
))

print(len(train_data_hs), len(dev_data_hs))

# The label files store one integer per line (0, 1, 2 or 3); they are used as-is.
with open(os.path.join(cur_dir, data_dir_hs, "train-labels.lst")) as f:
    train_labels_hs = [int(line.split()[0]) for line in f]

with open(os.path.join(cur_dir, data_dir_hs, "valid-labels.lst")) as f:
    dev_labels_hs = [int(line.split()[0]) for line in f]

print(len(train_labels_hs), len(dev_labels_hs))

ValueError: ignored

In [None]:
class HellaSwagDataset(Dataset):
    """
    Dataset wrapper that turns HellaSwag examples into tokenizer encodings.

    Each example yields four candidate sequences: the context paired with each
    of the first four ending options. They are tokenized together here, so
    they are padded to a common length within the example; prepare_batch_MC
    pads across the examples of a batch.
    """
    def __init__(self, tokenizer, x, y):
        # x: list of (context, [ending 1, ending 2, ending 3, ending 4]) tuples
        # y: list with the index (0-3) of the correct ending for each example
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        example = self.x[idx]
        # The same context is repeated once per candidate ending.
        contexts = [example[0] for _ in range(4)]
        endings = [example[1][option] for option in range(4)]
        encoding = self.roberta_tokenizer(
            contexts,
            endings,
            return_tensors='pt',
            padding=True,
        )
        return (encoding, self.y[idx])


## Extract SocialIQA data

Getting data and corresponding labels from dev and train split

In [None]:
# Paths of the SocialIQA train/dev splits (JSON-lines files).
file_train_siqa = os.path.join(cur_dir, data_dir_siqa, "train.jsonl")
file_dev_siqa = os.path.join(cur_dir, data_dir_siqa, "dev.jsonl")

# Parse each split into a DataFrame, one example per row.
json_train = pd.read_json(file_train_siqa, lines=True)
json_dev = pd.read_json(file_dev_siqa, lines=True)

# Every example becomes a tuple (context, question, answerA, answerB, answerC).
train_data_siqa = list(zip(
    json_train['context'].tolist(),
    json_train['question'].tolist(),
    json_train['answerA'].tolist(),
    json_train['answerB'].tolist(),
    json_train['answerC'].tolist(),
))
dev_data_siqa = list(zip(
    json_dev['context'].tolist(),
    json_dev['question'].tolist(),
    json_dev['answerA'].tolist(),
    json_dev['answerB'].tolist(),
    json_dev['answerC'].tolist(),
))

print(len(train_data_siqa), len(dev_data_siqa))

# The label files store one integer per line; subtracting 1 turns the stored
# values into the 0/1/2 indices used for training.
with open(os.path.join(cur_dir, data_dir_siqa, "train-labels.lst")) as f:
    train_labels_siqa = [int(line.split()[0]) - 1 for line in f]

with open(os.path.join(cur_dir, data_dir_siqa, "dev-labels.lst")) as f:
    dev_labels_siqa = [int(line.split()[0]) - 1 for line in f]

print(len(train_labels_siqa), len(dev_labels_siqa))

ValueError: ignored

In [None]:
class SocialiqaDataset(Dataset):
    """
    Dataset wrapper that turns SocialIQA examples into tokenizer encodings.

    Each example yields three candidate sequences: "context <sep><sep> question"
    paired with each of the three answers. They are tokenized together here,
    so they are padded to a common length within the example; prepare_batch_MC
    pads across the examples of a batch.
    """
    def __init__(self, tokenizer, x, y):
        # x: list of (context, question, answer1, answer2, answer3) tuples
        # y: list with the index of the correct answer for each example
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        example = self.x[idx]
        double_sep = self.roberta_tokenizer.sep_token + self.roberta_tokenizer.sep_token
        # The context/question prefix is identical for all three candidates.
        prefix = example[0] + double_sep + example[1]
        prefixes = [prefix, prefix, prefix]
        answers = [example[answer_pos] for answer_pos in (2, 3, 4)]
        encoding = self.roberta_tokenizer(
            prefixes,
            answers,
            return_tensors='pt',
            padding=True,
        )
        return (encoding, self.y[idx])


## Prepare batch MC (all)

In [None]:
def prepare_batch_MC(batch, tokenizer):
    """
    Collate function that pads a batch of multiple-choice examples to one length.

    batch is a sequence of (features, label) pairs, where features maps each
    key (e.g. "input_ids", "attention_mask") to one row per answer choice.
    The rows of all examples are flattened into a single list, padded together
    with tokenizer.pad, and then reshaped back to
    (batch_size, num_choices, padded_length).

    Returns a tuple (padded feature dict, int64 label tensor).
    """
    n_examples = len(batch)

    # Split the (features, label) pairs into two parallel tuples.
    features, labels = zip(*batch)
    # The number of choices is read from the first example.
    n_choices = len(features[0]["input_ids"])

    # Flatten: one dict per (example, choice), in example-major order.
    per_choice_features = []
    for feature in features:
        for choice_idx in range(n_choices):
            per_choice_features.append(
                {key: value[choice_idx] for key, value in feature.items()}
            )

    padded = tokenizer.pad(
        per_choice_features,
        padding=True,
        return_tensors="pt",
    )

    # Un-flatten back to (batch_size, num_choices, padded_length).
    unflattened = {
        key: value.view(n_examples, n_choices, -1) for key, value in padded.items()
    }
    return (unflattened, torch.tensor(labels, dtype=torch.int64))


# Model Class and Training Class

In [None]:
class Multiple_Choice_Model(nn.Module):
    """
    Multiple-choice head on top of an encoder.

    Every answer choice is encoded independently, the second element of the
    encoder output (named pooled_output here) goes through dropout and a
    linear layer that produces one score per choice, and the scores of one
    example are compared with cross-entropy.

    Args:
        roberta_model: encoder module. It must expose config.hidden_size and
            config.hidden_dropout_prob, accept input_ids/attention_mask
            keyword arguments, and return an indexable output whose element 1
            is the pooled representation.
        dropout: dropout probability applied before the classifier. When
            None, the encoder's config.hidden_dropout_prob is used.
    """
    def __init__(self, roberta_model: nn.Module, dropout: float = None):
        super(Multiple_Choice_Model, self).__init__()
        self.roberta = roberta_model
        # Honour an explicit dropout value; otherwise fall back to the encoder's setting.
        dropout_prob = self.roberta.config.hidden_dropout_prob if dropout is None else dropout
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels=None):
        """
        Score every answer choice of every example.

        Args:
            input_ids: tensor of shape (batch_size, num_choices, sequence_length).
            attention_mask: tensor with the same shape as input_ids.
            labels: optional tensor of shape (batch_size,) with the index of
                the correct choice.

        Returns:
            (loss, logits): logits has shape (batch_size, num_choices); loss is
            the cross-entropy against labels, or None when labels is not given.
        """
        num_choices = input_ids.shape[1]
        # Collapse (batch, choices) into one axis so the encoder sees ordinary 2-D input.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))

        outputs = self.roberta(
            input_ids=flat_input_ids,
            attention_mask=flat_attention_mask,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # One row per example, one column per choice.
        reshaped_logits = logits.view(-1, num_choices)

        # The loss is only defined when gold labels are supplied (inference passes none).
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(reshaped_logits, labels)

        return loss, reshaped_logits

In [None]:
from sklearn.metrics import classification_report

class Trainer(object):
    """
    Trainer for training a multiple choice classification model.

    The model is expected to be called as
    model(input_ids=..., attention_mask=..., labels=...) and to return a
    sequence whose first two elements are (loss, logits), with logits of shape
    (batch_size, num_choices). Loaders must yield (features, labels) pairs
    where features has "input_ids" and "attention_mask" tensors.
    """

    # Display names of the answer classes for each supported dataset key.
    _TARGET_NAMES = {
        'hs': ['Ending Option 1', 'Ending Option 2', 'Ending Option 3', 'Ending Option 4'],
        'siqa': ['Answer A', 'Answer B', 'Answer C'],
        'anli': ['Hypothesis 1', 'Hypothesis 2'],
    }

    def __init__(self, model, optimizer, device="cpu"):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device

    def _print_summary(self):
        print(self.model)
        print(self.optimizer)

    def _forward_batch(self, batch):
        """Move one batch to the device and run the model; returns (loss, logits, labels)."""
        # input_ids / attention_mask shape: (batch_size, num_choices, sequence_length)
        input_ids = batch[0]['input_ids'].to(self.device)
        attention_mask = batch[0]['attention_mask'].to(self.device)
        # labels shape: (batch_size, )
        labels = batch[1].to(self.device)

        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             labels=labels)
        return outputs[0], outputs[1], labels

    @staticmethod
    def _mean_loss(total_loss, n_examples):
        """Average an example-weighted loss sum; None when no example was seen."""
        if n_examples == 0:
            return None
        return torch.tensor(total_loss / n_examples)

    def train(self, loader):
        """
        Run a single epoch of training.

        Returns:
            (loss, true_labels, predictions): loss is a 0-d tensor holding the
            mean per-example loss over the whole epoch (None for an empty
            loader); the two lists hold the gold and predicted class indices.
        """
        self.model.train()  # Run model in training mode

        total_loss, n_examples = 0.0, 0
        epoch_true_labels = []
        epoch_preds = []
        for batch in tqdm(loader, desc="train"):
            # clear gradient
            self.optimizer.zero_grad()
            loss, logits, labels = self._forward_batch(batch)

            # back propagation, then one optimizer step
            loss.backward()
            self.optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the mean.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            n_examples += batch_size

            epoch_true_labels.extend(labels.tolist())
            # argmax over raw logits equals argmax over their softmax
            epoch_preds.extend(torch.argmax(logits, dim=1).tolist())

        return self._mean_loss(total_loss, n_examples), epoch_true_labels, epoch_preds

    def evaluate(self, loader):
        """
        Evaluate the model on a validation set.

        The validation loaders in this notebook use batch size 1 without the
        padding collate function; the loss is averaged over all examples so
        the result does not depend on which example happens to come last.

        Returns:
            (loss, true_labels, predictions), in the same format as train().
        """
        self.model.eval()  # Run model in eval mode (disables dropout layer)

        total_loss, n_examples = 0.0, 0
        epoch_true_labels = []
        epoch_preds = []
        with torch.no_grad():  # Gradients are only needed during training
            for batch in tqdm(loader, desc="valid"):
                loss, logits, labels = self._forward_batch(batch)

                batch_size = labels.size(0)
                total_loss += loss.item() * batch_size
                n_examples += batch_size

                epoch_true_labels.extend(labels.tolist())
                epoch_preds.extend(torch.argmax(logits, dim=1).tolist())

        return self._mean_loss(total_loss, n_examples), epoch_true_labels, epoch_preds

    def get_model_dict(self):
        return self.model.state_dict()

    def run_training(self, train_loader, valid_loader, save_location, dataset, n_epochs=3):
        """
        Train for n_epochs, report metrics each epoch, and checkpoint on improvement.

        Args:
            train_loader: loader of training batches.
            valid_loader: loader of validation batches.
            save_location: sub-folder of cur_dir/out_dir that receives the checkpoints.
            dataset: 'hs', 'siqa' or 'anli'; selects the class names shown in
                the reports (any other value prints the reports without names).
            n_epochs: number of epochs to run.

        A checkpoint named model-mc-checkpoint-epoch{N}.pt (N is 1-based) is
        written whenever the mean validation loss improves. At the end the
        train and validation loss curves are plotted.
        """
        # self._print_summary()
        target_names = self._TARGET_NAMES.get(dataset)

        # torch.save does not create missing folders, so make sure the target exists.
        checkpoint_dir = os.path.join(cur_dir, out_dir, save_location)
        os.makedirs(checkpoint_dir, exist_ok=True)

        losses_valid = []
        losses_train = []
        best_valid = float("inf")
        for i in range(n_epochs):
            epoch_loss_train, labels, preds = self.train(train_loader)
            print("Train eval")
            print(classification_report(labels, preds, target_names=target_names))

            epoch_loss_valid, labels, preds = self.evaluate(valid_loader)
            print("Valid eval")
            print(classification_report(labels, preds, target_names=target_names))

            valid_value = epoch_loss_valid.item()
            if valid_value < best_valid:
                best_valid = valid_value
                torch.save(self.get_model_dict(), os.path.join(checkpoint_dir, f'model-mc-checkpoint-epoch{i+1}.pt'))

            losses_train.append(epoch_loss_train.item())
            losses_valid.append(valid_value)
            print(f"Epoch {i}")
            print(f"Train loss: {epoch_loss_train}")
            print(f"Valid loss: {epoch_loss_valid}")

        # Keyword x/y: newer seaborn releases no longer accept them positionally.
        train_epoch_idx = list(range(len(losses_train)))
        valid_epoch_idx = list(range(len(losses_valid)))
        sns.lineplot(x=train_epoch_idx, y=losses_train, label="train")
        sns.lineplot(x=valid_epoch_idx, y=losses_valid, label="valid")
        plt.show()

# Phase 2

## Experiment 1: Training on HellaSwag first

Setting up model hyperparameters

In [None]:
# Hyperparameters for the HellaSwag run.
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 1e-5
NUM_EPOCHS = 10

# Fresh pretrained RoBERTa encoder, wrapped in the multiple-choice head.
roberta_base = RobertaModel.from_pretrained('roberta-base')
mc_model_hs = Multiple_Choice_Model(roberta_base)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# AdamW is imported from transformers at the top of the notebook (not torch.optim).
optimizer = AdamW(
    mc_model_hs.parameters(),
    lr=ADAM_LEARNING_RATE,
)

Create Dataset Objects


In [None]:
# One tokenizer instance is shared by both HellaSwag splits.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

dataset_train_hs = HellaSwagDataset(tokenizer=tokenizer, x=train_data_hs, y=train_labels_hs)
dataset_dev_hs = HellaSwagDataset(tokenizer=tokenizer, x=dev_data_hs, y=dev_labels_hs)

In [None]:
# Training batches are shuffled and padded to a common length by prepare_batch_MC.
train_loader_hs = DataLoader(
    dataset_train_hs,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda examples: prepare_batch_MC(examples, tokenizer),
)
# Validation runs one example at a time, in order, with the default collation.
val_loader_hs = DataLoader(
    dataset_dev_hs,
    batch_size=1,
    shuffle=False,
)

In [None]:
# Train on HellaSwag; checkpoints go to <cur_dir>/<out_dir>/roberta-pretrain-hellaswag.
trainer_hs = Trainer(model=mc_model_hs, optimizer=optimizer, device=device)
trainer_hs.run_training(
    train_loader=train_loader_hs,
    valid_loader=val_loader_hs,
    save_location='roberta-pretrain-hellaswag',
    dataset='hs',
    n_epochs=NUM_EPOCHS,
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
                 precision    recall  f1-score   support

Ending Option 1       0.46      0.46      0.46      9977
Ending Option 2       0.46      0.46      0.46      9944
Ending Option 3       0.47      0.46      0.46      9986
Ending Option 4       0.47      0.46      0.47      9998

       accuracy                           0.46     39905
      macro avg       0.46      0.46      0.46     39905
   weighted avg       0.46      0.46      0.46     39905



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
                 precision    recall  f1-score   support

Ending Option 1       0.57      0.54      0.55      2570
Ending Option 2       0.55      0.56      0.56      2513
Ending Option 3       0.55      0.56      0.55      2482
Ending Option 4       0.56      0.56      0.56      2477

       accuracy                           0.56     10042
      macro avg       0.56      0.56      0.56     10042
   weighted avg       0.56      0.56      0.56     10042

Epoch 0
Train loss: 2.361659526824951
Valid loss: 1.412549376487732


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
                 precision    recall  f1-score   support

Ending Option 1       0.63      0.63      0.63      9977
Ending Option 2       0.63      0.63      0.63      9944
Ending Option 3       0.63      0.63      0.63      9986
Ending Option 4       0.64      0.64      0.64      9998

       accuracy                           0.63     39905
      macro avg       0.63      0.63      0.63     39905
   weighted avg       0.63      0.63      0.63     39905



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
                 precision    recall  f1-score   support

Ending Option 1       0.59      0.58      0.59      2570
Ending Option 2       0.59      0.60      0.59      2513
Ending Option 3       0.59      0.58      0.58      2482
Ending Option 4       0.58      0.59      0.58      2477

       accuracy                           0.59     10042
      macro avg       0.59      0.59      0.59     10042
   weighted avg       0.59      0.59      0.59     10042

Epoch 1
Train loss: 0.8091650605201721
Valid loss: 0.22426730394363403


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
                 precision    recall  f1-score   support

Ending Option 1       0.75      0.75      0.75      9977
Ending Option 2       0.75      0.75      0.75      9944
Ending Option 3       0.75      0.75      0.75      9986
Ending Option 4       0.76      0.75      0.75      9998

       accuracy                           0.75     39905
      macro avg       0.75      0.75      0.75     39905
   weighted avg       0.75      0.75      0.75     39905



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
                 precision    recall  f1-score   support

Ending Option 1       0.60      0.59      0.60      2570
Ending Option 2       0.60      0.60      0.60      2513
Ending Option 3       0.59      0.59      0.59      2482
Ending Option 4       0.58      0.60      0.59      2477

       accuracy                           0.59     10042
      macro avg       0.59      0.59      0.59     10042
   weighted avg       0.59      0.59      0.59     10042

Epoch 2
Train loss: 1.8184270858764648
Valid loss: 0.9447184205055237


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: ignored

Debugging: run this cell if you want to save the model as it is after the last epoch, instead of only the checkpoint from the epoch with the best validation loss.

In [None]:
# Debug helper: persist the current weights of the HellaSwag model, i.e. its
# state after the most recent training step rather than the best-validation checkpoint.
roberta_base = RobertaModel.from_pretrained('roberta-base') # placeholder
init_model = Multiple_Choice_Model(roberta_base)
file_name = 'model-mc-checkpoint-.pt'
# mc_model_hs is the model trained in this experiment; no plain `mc_model`
# exists in the notebook, so referencing it would raise a NameError.
torch.save(mc_model_hs.state_dict(), os.path.join(cur_dir, out_dir, "kaleen", file_name))

## Experiment 2: Training on aNLI first

In [None]:
# Hyperparameters for the aNLI run.
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 1e-5
NUM_EPOCHS = 10

# Fresh pretrained RoBERTa encoder, wrapped in the multiple-choice head.
roberta_base = RobertaModel.from_pretrained('roberta-base')
mc_model_anli = Multiple_Choice_Model(roberta_base)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# AdamW is imported from transformers at the top of the notebook (not torch.optim).
optimizer = AdamW(
    mc_model_anli.parameters(),
    lr=ADAM_LEARNING_RATE,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# One tokenizer instance is shared by both aNLI splits.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Training batches are shuffled and padded to a common length by prepare_batch_MC.
train_loader_anli = DataLoader(
    aNliDataset(tokenizer, train_data_anli, train_labels_anli),
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda examples: prepare_batch_MC(examples, tokenizer),
)

# Validation runs one example at a time, in order, with the default collation.
val_loader_anli = DataLoader(
    aNliDataset(tokenizer, dev_data_anli, dev_labels_anli),
    batch_size=1,
    shuffle=False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [None]:
# Train on aNLI; checkpoints go to <cur_dir>/<out_dir>/roberta-pretrain-anli.
trainer_anli = Trainer(model=mc_model_anli, optimizer=optimizer, device=device)
trainer_anli.run_training(
    train_loader=train_loader_anli,
    valid_loader=val_loader_anli,
    save_location='roberta-pretrain-anli',
    dataset='anli',
    n_epochs=NUM_EPOCHS,
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

Hypothesis 1       0.90      0.90      0.90     84832
Hypothesis 2       0.90      0.90      0.90     84822

    accuracy                           0.90    169654
   macro avg       0.90      0.90      0.90    169654
weighted avg       0.90      0.90      0.90    169654



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

Hypothesis 1       0.71      0.69      0.70       781
Hypothesis 2       0.69      0.71      0.70       751

    accuracy                           0.70      1532
   macro avg       0.70      0.70      0.70      1532
weighted avg       0.70      0.70      0.70      1532

Epoch 0
Train loss: 0.5005592107772827
Valid loss: 1.2574968338012695


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

Hypothesis 1       0.94      0.95      0.94     84832
Hypothesis 2       0.95      0.94      0.94     84822

    accuracy                           0.94    169654
   macro avg       0.94      0.94      0.94    169654
weighted avg       0.94      0.94      0.94    169654



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

Hypothesis 1       0.72      0.70      0.71       781
Hypothesis 2       0.70      0.72      0.71       751

    accuracy                           0.71      1532
   macro avg       0.71      0.71      0.71      1532
weighted avg       0.71      0.71      0.71      1532

Epoch 1
Train loss: 0.07442649453878403
Valid loss: 2.3839452266693115


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

Hypothesis 1       0.97      0.96      0.97     84832
Hypothesis 2       0.96      0.97      0.97     84822

    accuracy                           0.97    169654
   macro avg       0.97      0.97      0.97    169654
weighted avg       0.97      0.97      0.97    169654



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

Hypothesis 1       0.72      0.70      0.71       781
Hypothesis 2       0.70      0.72      0.71       751

    accuracy                           0.71      1532
   macro avg       0.71      0.71      0.71      1532
weighted avg       0.71      0.71      0.71      1532

Epoch 2
Train loss: 0.0529906339943409
Valid loss: 6.916586399078369


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

## Experiment 3: Training HellaSwag after aNLI (aNLI --> HellaSwag) (start here!)

In [None]:
#*********************RESTORE MODEL TRAINED ON aNLI************************
# map_location='cpu' lets a checkpoint saved from a GPU run be restored on any
# runtime; Trainer moves the model to DEVICE afterwards.
mc_model_anli_restored_state_dict = torch.load(
    os.path.join(cur_dir, out_dir, "roberta-pretrain-anli", 'model-mc-checkpoint-epoch1.pt'),
    map_location='cpu',
)

# The pretrained weights only provide the architecture here; every parameter is
# overwritten by the checkpoint below.
roberta_base = RobertaModel.from_pretrained('roberta-base') # placeholder
mc_model_anli_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
mc_model_anli_restored.load_state_dict(mc_model_anli_restored_state_dict)


#********************SET HYPERPARAMETERS*********************************
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 1e-5
NUM_EPOCHS = 10
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
OPTIMIZER = AdamW(mc_model_anli_restored.parameters(), lr=ADAM_LEARNING_RATE)


#********************DATA*******************************
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Training batches are shuffled and padded to a common length by prepare_batch_MC.
train_loader_hs = DataLoader(HellaSwagDataset(tokenizer, train_data_hs, train_labels_hs),
                             batch_size=TRAIN_BATCH_SIZE,
                             shuffle=True,
                             collate_fn=lambda batch: prepare_batch_MC(batch, tokenizer))

# Validation runs one example at a time, in order, with the default collation.
val_loader_hs = DataLoader(HellaSwagDataset(tokenizer, dev_data_hs, dev_labels_hs),
                           batch_size=1,
                           shuffle=False)


#********************SET UP AND RUN TRAINING*******************************
trainer_anli_hs = Trainer(mc_model_anli_restored, OPTIMIZER, DEVICE)
trainer_anli_hs.run_training(train_loader_hs, val_loader_hs, save_location='roberta-pretrain-hellaswag-after-anli', dataset='hs', n_epochs=NUM_EPOCHS)

# Phase 3: Fine-tuning on SocialIQA

Restore model trained on HellaSwag

In [None]:
# Load the HellaSwag checkpoint. map_location='cpu' lets a checkpoint saved from
# a GPU run be restored on any runtime; Trainer moves the model to the training
# device later.
mc_model_hs_restored_state_dict = torch.load(
    os.path.join(cur_dir, out_dir, "roberta-pretrain-hellaswag", 'model-mc-checkpoint-epoch2.pt'),
    map_location='cpu',
)

# The pretrained weights only provide the architecture here; every parameter is
# overwritten by the checkpoint below.
roberta_base = RobertaModel.from_pretrained('roberta-base') # placeholder
mc_model_hs_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
mc_model_hs_restored.load_state_dict(mc_model_hs_restored_state_dict)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
# Hyperparameters for fine-tuning on SocialIQA.
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 1e-5
NUM_EPOCHS = 10

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# AdamW is imported from transformers at the top of the notebook (not torch.optim).
# It optimizes the model restored from the HellaSwag checkpoint.
optimizer = AdamW(
    mc_model_hs_restored.parameters(),
    lr=ADAM_LEARNING_RATE,
)

Create dataset objects

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def collate_siqa_batch(batch):
    """Collate one training batch by delegating to prepare_batch_MC with the shared tokenizer."""
    return prepare_batch_MC(batch, tokenizer)


# Wrap the SocialIQA train / dev splits in dataset objects
dataset_train_siqa = SocialiqaDataset(tokenizer, train_data_siqa, train_labels_siqa)
dataset_dev_siqa = SocialiqaDataset(tokenizer, dev_data_siqa, dev_labels_siqa)

# Training batches are shuffled and built with the custom collate function
train_loader_siqa = DataLoader(
    dataset_train_siqa,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_siqa_batch,
)
# Validation goes through one example at a time, in order, with default collation
val_loader_siqa = DataLoader(
    dataset_dev_siqa,
    batch_size=1,
    shuffle=False,
)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [None]:
# Train the restored model on the SocialIQA loaders for NUM_EPOCHS epochs;
# the run is labelled 'roberta-finetune-socialiqa-after-hs'.
trainer_socialiqa = Trainer(mc_model_hs_restored, optimizer, device)
trainer_socialiqa.run_training(
    train_loader_siqa,
    val_loader_siqa,
    save_location='roberta-finetune-socialiqa-after-hs',
    dataset='siqa',
    n_epochs=NUM_EPOCHS,
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.73      0.72      0.73     11274
    Answer B       0.73      0.72      0.72     11176
    Answer C       0.72      0.73      0.73     10960

    accuracy                           0.72     33410
   macro avg       0.72      0.72      0.72     33410
weighted avg       0.72      0.72      0.72     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.69      0.69      0.69       643
    Answer B       0.68      0.67      0.68       654
    Answer C       0.70      0.71      0.70       657

    accuracy                           0.69      1954
   macro avg       0.69      0.69      0.69      1954
weighted avg       0.69      0.69      0.69      1954

Epoch 0
Train loss: 0.9516472816467285
Valid loss: 0.02320549637079239


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.81      0.80      0.80     11274
    Answer B       0.80      0.80      0.80     11176
    Answer C       0.79      0.81      0.80     10960

    accuracy                           0.80     33410
   macro avg       0.80      0.80      0.80     33410
weighted avg       0.80      0.80      0.80     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.70      0.70      0.70       643
    Answer B       0.70      0.69      0.69       654
    Answer C       0.71      0.71      0.71       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954

Epoch 1
Train loss: 0.9729134440422058
Valid loss: 0.012159978039562702


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.86      0.86      0.86     11274
    Answer B       0.86      0.85      0.86     11176
    Answer C       0.84      0.86      0.85     10960

    accuracy                           0.86     33410
   macro avg       0.86      0.86      0.86     33410
weighted avg       0.86      0.86      0.86     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.70      0.69      0.69       643
    Answer B       0.70      0.68      0.69       654
    Answer C       0.68      0.72      0.70       657

    accuracy                           0.69      1954
   macro avg       0.70      0.69      0.69      1954
weighted avg       0.70      0.69      0.69      1954

Epoch 2
Train loss: 0.00817684642970562
Valid loss: 0.0019859608728438616


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.90      0.90      0.90     11274
    Answer B       0.91      0.90      0.90     11176
    Answer C       0.90      0.91      0.90     10960

    accuracy                           0.90     33410
   macro avg       0.90      0.90      0.90     33410
weighted avg       0.90      0.90      0.90     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.70      0.72      0.71       643
    Answer B       0.71      0.69      0.70       654
    Answer C       0.72      0.72      0.72       657

    accuracy                           0.71      1954
   macro avg       0.71      0.71      0.71      1954
weighted avg       0.71      0.71      0.71      1954

Epoch 3
Train loss: 0.1450457125902176
Valid loss: 0.001311871805228293


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.93      0.93      0.93     11274
    Answer B       0.93      0.93      0.93     11176
    Answer C       0.93      0.93      0.93     10960

    accuracy                           0.93     33410
   macro avg       0.93      0.93      0.93     33410
weighted avg       0.93      0.93      0.93     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.69      0.71      0.70       643
    Answer B       0.70      0.67      0.69       654
    Answer C       0.69      0.72      0.70       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954

Epoch 4
Train loss: 0.34976643323898315
Valid loss: 1.645074735279195e-05


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.95      0.95      0.95     11274
    Answer B       0.95      0.95      0.95     11176
    Answer C       0.95      0.95      0.95     10960

    accuracy                           0.95     33410
   macro avg       0.95      0.95      0.95     33410
weighted avg       0.95      0.95      0.95     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.70      0.71      0.70       643
    Answer B       0.71      0.70      0.70       654
    Answer C       0.70      0.70      0.70       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954

Epoch 5
Train loss: 0.031468380242586136
Valid loss: 2.861018856492592e-06


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.96      0.96      0.96     11274
    Answer B       0.96      0.96      0.96     11176
    Answer C       0.96      0.96      0.96     10960

    accuracy                           0.96     33410
   macro avg       0.96      0.96      0.96     33410
weighted avg       0.96      0.96      0.96     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.69      0.71      0.70       643
    Answer B       0.70      0.67      0.68       654
    Answer C       0.69      0.69      0.69       657

    accuracy                           0.69      1954
   macro avg       0.69      0.69      0.69      1954
weighted avg       0.69      0.69      0.69      1954

Epoch 6
Train loss: 2.9802316703353426e-07
Valid loss: 0.00012861855793744326


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: ignored

# Evaluation

Model just trained on SocialIQA

In [None]:
# Load the state dict saved under the "roberta-base-socialiqa" run.
# (The original line ended with an unmatched ')' which made the cell a SyntaxError.)
# map_location="cpu" lets the checkpoint load on a CPU-only runtime even when it
# was saved from a GPU.
mc_model_restored_state_dict = torch.load(
    os.path.join(cur_dir, out_dir, "roberta-base-socialiqa", 'model-mc-checkpoint-epoch2.pt'),
    map_location="cpu",
)

# Build the model skeleton around a fresh roberta-base encoder before loading the saved weights
roberta_base = RobertaModel.from_pretrained('roberta-base')
mc_model_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
mc_model_restored.load_state_dict(mc_model_restored_state_dict)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initializing with hidden size 768


<All keys matched successfully>

Model trained on HellaSwag, then SocialIQA

In [None]:
# Load the state dict saved under the "roberta-finetune-socialiqa-after-hs" run.
# map_location="cpu" lets the checkpoint load on a CPU-only runtime even when it
# was saved from a GPU.
hs_siqa_mc_model_restored_state_dict = torch.load(
    os.path.join(cur_dir, out_dir, "roberta-finetune-socialiqa-after-hs", 'model-mc-checkpoint-epoch6.pt'),
    map_location="cpu",
)

# The freshly downloaded roberta-base encoder is only a placeholder used to
# build the model skeleton before the saved weights are loaded into it.
roberta_base = RobertaModel.from_pretrained('roberta-base')
hs_siqa_mc_model_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
hs_siqa_mc_model_restored.load_state_dict(hs_siqa_mc_model_restored_state_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

# Checking against baseline (pretrained Roberta for MC)

The baseline is HuggingFace's `RobertaForMultipleChoice` initialized from the pretrained `roberta-base` checkpoint without any fine-tuning; its multiple-choice classification head is newly initialized, so near-chance accuracy is expected. We check
performance on the validation set of SocialIQA. Performance here is the multiclass classification report: precision, recall, F1 score, and accuracy. The validation set has a fairly balanced distribution of choice A, B, and C examples.

Performance comparison (macro-averaged P, R, F1, followed by **accuracy**):

*   Baseline: 0.36, 0.36, 0.36, **0.36**
*   Fine-tuned on SocialIQA (2 epochs): 0.70, 0.70, 0.70, **0.70**
*   Pretrained on HellaSwag, then fine-tuned on SocialIQA:  0.70, 0.70, 0.70, **0.70**





In [None]:
from sklearn.metrics import classification_report
from transformers import RobertaForMultipleChoice

In [None]:
# Baseline: tokenizer and RobertaForMultipleChoice both come from the same
# pretrained checkpoint; no fine-tuning is applied to this model here.
baseline_checkpoint = 'roberta-base'
tokenizer2 = RobertaTokenizer.from_pretrained(baseline_checkpoint)
model2 = RobertaForMultipleChoice.from_pretrained(baseline_checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultipleChoice: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

In [None]:
# Collect the baseline model's predicted answer index for every dev example.
# NOTE(review): dev_data / dev_labels are not defined in the visible part of this
# notebook (other cells use dev_data_siqa / dev_labels_siqa) -- confirm they refer
# to the SocialIQA dev split.
preds = []
double_sep = tokenizer2.sep_token + tokenizer2.sep_token

model2.eval()  # make sure dropout is off while predicting
with torch.no_grad():  # inference only: skip autograd bookkeeping for every forward pass
    for point, label in zip(dev_data, dev_labels):
        # The same "context </s></s> question" text is paired with each of the three answers
        context_question = point[0] + double_sep + point[1]
        input_context_question = [context_question] * 3
        input_answers = [point[2], point[3], point[4]]
        encoding = tokenizer2(input_context_question, input_answers, return_tensors='pt', padding=True)
        labels = torch.tensor(label).unsqueeze(0)
        # unsqueeze(0) adds the leading batch dimension
        outputs = model2(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1
        preds.append(torch.argmax(outputs.logits, dim=1).tolist()[0])

In [None]:
# Display names for the three answer classes in the report
target_names = ['Answer A', 'Answer B', 'Answer C']

print("Not trained on socialiqa model evaluation")
baseline_report = classification_report(
    dev_labels,
    preds,
    target_names=target_names,
)
print(baseline_report)

Not trained on socialiqa model evaluation
              precision    recall  f1-score   support

    Answer A       0.34      0.35      0.35       643
    Answer B       0.36      0.36      0.36       654
    Answer C       0.36      0.35      0.36       657

    accuracy                           0.36      1954
   macro avg       0.36      0.36      0.36      1954
weighted avg       0.36      0.36      0.36      1954



In [None]:
# Trainer is constructed with an optimizer, so one is built for the restored model.
# NOTE(review): evaluate() presumably never steps the optimizer -- confirm.
optimizer = AdamW(mc_model_restored.parameters(), lr=1e-5)

trainer_socialiqa = Trainer(mc_model_restored, optimizer, device)
# Evaluate on the SocialIQA validation loader defined with the other SocialIQA
# loaders (val_loader_siqa); the bare name `val_loader` is not defined in the
# visible notebook and the sibling evaluation cell uses val_loader_siqa as well.
epoch_loss_valid, labels, preds = trainer_socialiqa.evaluate(val_loader_siqa)
print("trained on socialiqa for two epochs evaluation")
print(classification_report(labels, preds, target_names=target_names))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


trained on socialiqa for two epochs evaluation
              precision    recall  f1-score   support

    Answer A       0.71      0.71      0.71       643
    Answer B       0.70      0.68      0.69       654
    Answer C       0.70      0.72      0.71       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954



In [None]:
from sklearn.metrics import classification_report

# Display names for the three answer classes in the report
target_names = ['Answer A', 'Answer B', 'Answer C']

# Trainer is constructed with an optimizer, so build one for the restored model
optimizer = AdamW(
    hs_siqa_mc_model_restored.parameters(),
    lr=1e-5,
)
trainer_socialiqa = Trainer(hs_siqa_mc_model_restored, optimizer, device)

# Run the restored model over the SocialIQA validation loader
epoch_loss_valid, labels, preds = trainer_socialiqa.evaluate(val_loader_siqa)

print("trained on hellaswag, then socialiqa evaluation")
hs_siqa_report = classification_report(
    labels,
    preds,
    target_names=target_names,
)
print(hs_siqa_report)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


trained on hellaswag, then socialiqa evaluation
              precision    recall  f1-score   support

    Answer A       0.70      0.71      0.70       643
    Answer B       0.71      0.70      0.70       654
    Answer C       0.70      0.70      0.70       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954

