# Top

In [2]:
#!pip install transformers
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
import os
import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from transformers import RobertaModel, RobertaTokenizer, AdamW

from tqdm.auto import tqdm

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import random
import numpy as np

# Project root on the training machine; the trailing comment keeps the Colab
# path that was used previously. Every data/checkpoint path below is built
# from this directory with os.path.join.
cur_dir = "/data/users/kashrest/socialiqa-nlp244" #"/content/drive/MyDrive/Colab_Notebooks/NLP_244_Advanced_ML/final_project_socialiqa/socialiqa-nlp244"
# Sub-directories (relative to cur_dir) holding each dataset's train/dev files.
data_dir_siqa = "socialiqa-train-dev"
data_dir_hs = "hellaswag-train-dev"
data_dir_anli = "alphanli-train-dev"

# Sub-directory (relative to cur_dir) from which model checkpoints are loaded.
out_dir = "out"

# Seed all three random number generators used in this notebook.
# NOTE(review): np.random.seed() seeds NumPy's global generator and returns
# None, so np_seed does not hold a usable value — confirm nothing reads it.
np_seed = np.random.seed(27)
torch.manual_seed(27)
random.seed(27)

# Data

## Extract aNLI data

Load the aNLI examples and their gold labels for the train and dev splits.


In [4]:
# Locate and read the aNLI train/dev examples (one JSON object per line).
file_train_anli = os.path.join(cur_dir, data_dir_anli, "train.jsonl")
file_dev_anli = os.path.join(cur_dir, data_dir_anli, "dev.jsonl")

json_train_anli = pd.read_json(file_train_anli, lines=True)
json_dev_anli = pd.read_json(file_dev_anli, lines=True)

# list of tuples (obs1, obs2, hyp1, hyp2)
train_data_anli = list(zip(json_train_anli['obs1'].tolist(),
                           json_train_anli['obs2'].tolist(),
                           json_train_anli['hyp1'].tolist(),
                           json_train_anli['hyp2'].tolist()))

dev_data_anli = list(zip(json_dev_anli['obs1'].tolist(),
                         json_dev_anli['obs2'].tolist(),
                         json_dev_anli['hyp1'].tolist(),
                         json_dev_anli['hyp2'].tolist()))

print(len(train_data_anli), len(dev_data_anli))

# Labels is a list of integers either 0, 1.
# The first whitespace-separated field of every line is the label; it is
# shifted down by one to obtain a zero-based class index. Blank lines (e.g. a
# trailing newline at the end of the file) are skipped instead of raising
# IndexError.
with open(os.path.join(cur_dir, data_dir_anli, "train-labels.lst")) as f:
    train_labels_anli = [int(line.split()[0]) - 1 for line in f if line.strip()]

with open(os.path.join(cur_dir, data_dir_anli, "dev-labels.lst")) as f:
    dev_labels_anli = [int(line.split()[0]) - 1 for line in f if line.strip()]

print(len(train_labels_anli), len(dev_labels_anli))

# Examples and labels are paired by position, so a length mismatch would
# silently mis-label the data; fail loudly here instead.
assert len(train_labels_anli) == len(train_data_anli), "aNLI train examples/labels are misaligned"
assert len(dev_labels_anli) == len(dev_data_anli), "aNLI dev examples/labels are misaligned"

169654 1532
169654 1532


In [5]:
class aNliDataset(Dataset):
    """
    Torch dataset that exposes aNLI examples as two-way multiple choice inputs.

    Each item is encoded into one sequence per hypothesis. The tokenizer pads
    the sequences belonging to a single example to a common length here;
    padding across the examples of a batch is left to the collate function
    (prepare_batch_MC).
    """
    def __init__(self, tokenizer, x, y):
        # x: list of (obs1, obs2, hyp1, hyp2) tuples
        # y: list holding, per example, the index of the correct hypothesis
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        example = self.x[idx]
        tok = self.roberta_tokenizer
        # First segment of each choice: obs1, two separator tokens, then one
        # of the two hypotheses (positions 2 and 3 of the tuple).
        first_segments = [
            example[0] + tok.sep_token + tok.sep_token + example[hyp_pos]
            for hyp_pos in (2, 3)
        ]
        # Second segment is obs2 for both choices.
        second_segments = [example[1] for _ in first_segments]
        encoding = tok(first_segments, second_segments, return_tensors='pt', padding=True)
        return (encoding, self.y[idx])


## Extract HellaSwag data

Load the HellaSwag examples and their gold labels for the train and validation splits.

In [6]:
# Locate and read the HellaSwag train/validation examples (one JSON object per line).
file_train_hs = os.path.join(cur_dir, data_dir_hs, "train.jsonl")
file_dev_hs = os.path.join(cur_dir, data_dir_hs, "valid.jsonl")

json_train_hs = pd.read_json(path_or_buf=file_train_hs, lines=True)
json_dev_hs = pd.read_json(path_or_buf=file_dev_hs, lines=True)

# list of tuples (context,
#                [ending option 1,
#                 ending option 2,
#                 ending option 3,
#                 ending option 4])
train_data_hs = list(zip(json_train_hs['ctx'].tolist(),
                         json_train_hs['ending_options'].tolist()))

dev_data_hs = list(zip(json_dev_hs['ctx'].tolist(),
                       json_dev_hs['ending_options'].tolist()))

print(len(train_data_hs), len(dev_data_hs))

# Labels is a list of integers either 0, 1, 2, 3.
# The first whitespace-separated field of every line is used as-is (no index
# shift for this dataset). Blank lines (e.g. a trailing newline at the end of
# the file) are skipped instead of raising IndexError.
with open(os.path.join(cur_dir, data_dir_hs, "train-labels.lst")) as f:
    train_labels_hs = [int(line.split()[0]) for line in f if line.strip()]

with open(os.path.join(cur_dir, data_dir_hs, "valid-labels.lst")) as f:
    dev_labels_hs = [int(line.split()[0]) for line in f if line.strip()]

print(len(train_labels_hs), len(dev_labels_hs))

# Examples and labels are paired by position, so a length mismatch would
# silently mis-label the data; fail loudly here instead.
assert len(train_labels_hs) == len(train_data_hs), "HellaSwag train examples/labels are misaligned"
assert len(dev_labels_hs) == len(dev_data_hs), "HellaSwag dev examples/labels are misaligned"

39905 10042
39905 10042


In [7]:
class HellaSwagDataset(Dataset):
    """
    Torch dataset that exposes HellaSwag examples as four-way multiple choice inputs.

    Each item is encoded into one (context, ending) sequence per ending
    option. The tokenizer pads the four sequences of a single example to a
    common length here; padding across the examples of a batch is left to the
    collate function (prepare_batch_MC).
    """
    def __init__(self, tokenizer, x, y):
        # x: list of (context, [ending 1, ending 2, ending 3, ending 4]) tuples
        # y: list holding, per example, the index (0-3) of the correct ending
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        example = self.x[idx]
        context, endings = example[0], example[1]
        # The same context is paired with each of the four candidate endings.
        contexts = [context for _ in range(4)]
        candidates = [endings[choice] for choice in range(4)]
        encoding = self.roberta_tokenizer(contexts, candidates, return_tensors='pt', padding=True)
        return (encoding, self.y[idx])


## Extract SocialIQA data

Load the SocialIQA examples and their gold labels for the train and dev splits.

In [8]:
# Locate and read the SocialIQA train/dev examples (one JSON object per line).
file_train_siqa = os.path.join(cur_dir, data_dir_siqa, "train.jsonl")
file_dev_siqa = os.path.join(cur_dir, data_dir_siqa, "dev.jsonl")

json_train = pd.read_json(path_or_buf=file_train_siqa, lines=True)
json_dev = pd.read_json(path_or_buf=file_dev_siqa, lines=True)

# Data is a list of tuples (context, question, A, B, C)
train_data_siqa = list(zip(json_train['context'].tolist(),
                           json_train['question'].tolist(),
                           json_train['answerA'].tolist(),
                           json_train['answerB'].tolist(),
                           json_train['answerC'].tolist()))

dev_data_siqa = list(zip(json_dev['context'].tolist(),
                         json_dev['question'].tolist(),
                         json_dev['answerA'].tolist(),
                         json_dev['answerB'].tolist(),
                         json_dev['answerC'].tolist()))

print(len(train_data_siqa), len(dev_data_siqa))

# Labels is a list of integers either 0, 1, 2.
# The first whitespace-separated field of every line is the label; it is
# shifted down by one to obtain a zero-based class index. Blank lines (e.g. a
# trailing newline at the end of the file) are skipped instead of raising
# IndexError.
with open(os.path.join(cur_dir, data_dir_siqa, "train-labels.lst")) as f:
    train_labels_siqa = [int(line.split()[0]) - 1 for line in f if line.strip()]

with open(os.path.join(cur_dir, data_dir_siqa, "dev-labels.lst")) as f:
    dev_labels_siqa = [int(line.split()[0]) - 1 for line in f if line.strip()]

print(len(train_labels_siqa), len(dev_labels_siqa))

# Examples and labels are paired by position, so a length mismatch would
# silently mis-label the data; fail loudly here instead.
assert len(train_labels_siqa) == len(train_data_siqa), "SocialIQA train examples/labels are misaligned"
assert len(dev_labels_siqa) == len(dev_data_siqa), "SocialIQA dev examples/labels are misaligned"

33410 1954
33410 1954


In [9]:
class SocialiqaDataset(Dataset):
    """
    Torch dataset that exposes SocialIQA examples as three-way multiple choice inputs.

    Each item is encoded into one (context + question, answer) sequence per
    answer option. The tokenizer pads the three sequences of a single example
    to a common length here; padding across the examples of a batch is left to
    the collate function (prepare_batch_MC).
    """
    def __init__(self, tokenizer, x, y):
        # x: list of (context, question, answer1, answer2, answer3) tuples
        # y: list holding, per example, the index of the correct answer
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        example = self.x[idx]
        tok = self.roberta_tokenizer
        # Context and question are joined by two separator tokens; the same
        # joined text is paired with each of the three answers.
        context_question = example[0] + tok.sep_token + tok.sep_token + example[1]
        first_segments = [context_question for _ in range(3)]
        second_segments = [example[2], example[3], example[4]]
        encoding = tok(first_segments, second_segments, return_tensors='pt', padding=True)
        return (encoding, self.y[idx])


## Prepare batch MC (all)

In [10]:
def prepare_batch_MC(batch, tokenizer):
    """
    Collate multiple-choice examples into a single padded batch.

    Every example carries one encoded sequence per answer choice. To pad all
    sequences in the batch to a common length, the choices are flattened into
    one list, padded with ``tokenizer.pad``, and then reshaped back to
    (batch_size, num_choices, padded_length).

    Args:
        batch: sequence of ``(features, label)`` pairs. ``features`` is a
            mapping (e.g. with keys "input_ids" and "attention_mask") whose
            values are indexable by choice; ``label`` is the integer index of
            the correct choice. The number of choices is taken from the first
            example and assumed to be the same for the whole batch.
        tokenizer: object providing ``pad(features, padding=..., return_tensors=...)``.

    Returns:
        Tuple ``(inputs, labels)`` where ``inputs`` maps each feature key to a
        tensor viewed as (batch_size, num_choices, -1) and ``labels`` is an
        int64 tensor with one entry per example.
    """
    # batch: [batch_size, (text, label)]
    batch_size = len(batch)

    features, labels = zip(*batch)
    # features: tuple of length batch_size,
    #        each element is a dict with keys = ["input_ids", "attention_mask"]
    # labels: tuple of int indices, length batch_size
    num_choices = len(features[0]["input_ids"])

    # Flatten to one {key: per-choice value} dict for every (example, choice)
    # pair, example-major. A nested comprehension is linear in the number of
    # pairs, whereas sum(list_of_lists, []) re-copies the accumulated list on
    # every addition.
    flattened_features = [
        {k: v[i] for k, v in feature.items()}
        for feature in features
        for i in range(num_choices)
    ]
    # flattened_features: list of length num_choices * batch_size

    padded = tokenizer.pad(
        flattened_features,
        padding=True,
        return_tensors="pt",
    )

    # Un-flatten: restore the (example, choice) structure.
    unflattened = {k: v.view(batch_size, num_choices, -1) for k, v in padded.items()}
    return (unflattened, torch.tensor(labels, dtype=torch.int64))


# Model Class and Training Class

In [11]:
class Multiple_Choice_Model(nn.Module):
    """
    Multiple-choice head on top of a RoBERTa encoder.

    Every (example, choice) sequence is encoded independently, its pooled
    representation is passed through dropout and a linear layer that produces
    a single score, and the scores of one example are grouped into a row of
    logits with one column per choice.
    """

    def __init__(self, roberta_model: RobertaModel, dropout: float = None):
        """
        Args:
            roberta_model: encoder whose ``config`` provides ``hidden_size``
                and ``hidden_dropout_prob``.
            dropout: dropout probability applied before the classifier. When
                None (the default) the encoder's ``hidden_dropout_prob`` is
                used. Previously this argument was accepted but ignored.
        """
        super(Multiple_Choice_Model, self).__init__()
        self.roberta = roberta_model
        if dropout is None:
            dropout = self.roberta.config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels=None):
        """
        Score every choice of every example.

        Args:
            input_ids: tensor of shape (batch_size, num_choices, sequence_length).
            attention_mask: tensor with the same shape as ``input_ids``.
            labels: optional tensor of shape (batch_size,) with the index of
                the correct choice.

        Returns:
            Tuple ``(loss, logits)``. ``logits`` has shape
            (batch_size, num_choices). ``loss`` is the cross-entropy between
            ``logits`` and ``labels``, or None when ``labels`` is not given
            (previously a missing ``labels`` made the loss computation fail).
        """
        num_choices = input_ids.shape[1]
        # Merge the batch and choice dimensions so the encoder sees a plain
        # (batch_size * num_choices, sequence_length) batch.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))

        outputs = self.roberta(
            input_ids=flat_input_ids,
            attention_mask=flat_attention_mask,
        )
        # Element 1 of the encoder output is used as the per-sequence summary
        # vector (presumably the pooled output of the Hugging Face
        # RobertaModel — confirm against the transformers version in use).
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # One score per sequence -> one row of num_choices scores per example.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        return loss, reshaped_logits

In [12]:
from sklearn.metrics import classification_report

class Trainer(object):
    """
    Trainer for training a multiple choice classification model.

    The model is expected to be callable as
    ``model(input_ids=..., attention_mask=..., labels=...)`` and to return a
    sequence whose first two elements are the batch loss and the logits of
    shape (batch_size, num_choices).
    """

    # Class names shown in the classification report, keyed by dataset tag.
    _TARGET_NAMES = {
        'hs': ['Ending Option 1', 'Ending Option 2', 'Ending Option 3', 'Ending Option 4'],
        'siqa': ['Answer A', 'Answer B', 'Answer C'],
        'anli': ['Hypothesis 1', 'Hypothesis 2'],
    }

    def __init__(self, model, optimizer, device="cpu"):
        # The model is moved to the target device once, up front.
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device

    def _print_summary(self):
        print(self.model)
        print(self.optimizer)

    def _run_epoch(self, loader, training):
        """
        Run one pass over ``loader``, updating the weights when ``training``.

        Returns ``(mean_loss, true_labels, predictions)``. ``mean_loss`` is a
        0-dim tensor holding the per-example average loss of the whole pass
        (batch losses weighted by batch size), or None for an empty loader.
        """
        # train(True) enables training mode, train(False) is eval mode
        # (disables dropout).
        self.model.train(training)

        total_loss = 0.0
        n_examples = 0
        epoch_true_labels = []
        epoch_preds = []
        # Gradients are only tracked while training.
        with torch.set_grad_enabled(training):
            # Iterating the loader directly lets tqdm show the total when the
            # loader has a length.
            for batch in tqdm(loader):
                if training:
                    # clear gradient
                    self.optimizer.zero_grad()
                # input_ids shape: (batch_size, num_choices, sequence_length)
                input_ids = batch[0]['input_ids'].to(self.device)
                # attention_mask shape: (batch_size, num_choices, sequence_length)
                attention_mask = batch[0]['attention_mask'].to(self.device)
                # labels shape: (batch_size, )
                labels = batch[1].to(self.device)

                outputs = self.model(input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     labels=labels)
                loss, logits = outputs[0], outputs[1]

                epoch_true_labels.extend(labels.tolist())
                # Softmax is monotonic, so the argmax of the raw logits is
                # the predicted class.
                epoch_preds.extend(torch.argmax(logits, dim=1).tolist())

                if training:
                    # back propagation
                    loss.backward()
                    # do gradient descent
                    self.optimizer.step()

                batch_examples = labels.size(0)
                total_loss += loss.item() * batch_examples
                n_examples += batch_examples

        mean_loss = torch.tensor(total_loss / n_examples) if n_examples else None
        return mean_loss, epoch_true_labels, epoch_preds

    def train(self, loader):
        """
        Run a single epoch of training.

        Returns ``(loss, true_labels, predictions)`` where ``loss`` is the
        mean loss over the epoch (previously only the last batch's loss was
        returned).
        """
        return self._run_epoch(loader, training=True)

    def evaluate(self, loader):
        """
        Evaluate the model on a validation set without updating weights.

        Returns ``(loss, true_labels, predictions)`` where ``loss`` is the
        mean loss over the whole set (previously only the last batch's loss
        was returned, i.e. a single example for a batch-size-1 loader).
        """
        return self._run_epoch(loader, training=False)

    def get_model_dict(self):
        return self.model.state_dict()

    def run_training(self, train_loader, valid_loader, save_location, dataset, n_epochs=3):
        """
        Train for ``n_epochs`` epochs, report metrics and plot the loss curves.

        Args:
            train_loader: loader passed to ``train`` each epoch.
            valid_loader: loader passed to ``evaluate`` each epoch.
            save_location: checkpoint sub-directory name. Currently unused
                because checkpoint saving is commented out below.
            dataset: 'hs', 'siqa' or 'anli'; selects the class names used in
                the classification report. Any other value prints the report
                without class names.
            n_epochs: number of epochs to run.
        """
        # Useful for us to review what experiment we're running
        # Normally, you'd want to save this to a file
        # self._print_summary()
        losses_valid = []
        losses_train = []
        best_valid = float("inf")

        target_names = self._TARGET_NAMES.get(dataset)
        # Passing the full label set keeps the report working even when a
        # class is missing from both the labels and the predictions.
        report_labels = list(range(len(target_names))) if target_names else None

        for i in range(n_epochs):
            epoch_loss_train, labels, preds = self.train(train_loader)
            print("Train eval")
            print(classification_report(labels, preds, labels=report_labels, target_names=target_names))

            epoch_loss_valid, labels, preds = self.evaluate(valid_loader)
            print("Valid eval")
            print(classification_report(labels, preds, labels=report_labels, target_names=target_names))

            if epoch_loss_valid < best_valid:
                best_valid = epoch_loss_valid
                # NOTE(review): checkpoint saving is disabled, so a fresh run
                # of this notebook does not produce the checkpoint files that
                # later cells load.
                #torch.save(self.get_model_dict(), os.path.join(cur_dir, out_dir, save_location, f'model-mc-checkpoint-epoch{i+1}.pt'))

            losses_train.append(epoch_loss_train.tolist())
            losses_valid.append(epoch_loss_valid.tolist())
            print(f"Epoch {i}")
            print(f"Train loss: {epoch_loss_train}")
            print(f"Valid loss: {epoch_loss_valid}")

        # seaborn expects the data vectors as keyword arguments; positional
        # x/y are rejected by current releases.
        epoch_idx = list(range(len(losses_train)))
        sns.lineplot(x=epoch_idx, y=losses_train, label="train")
        sns.lineplot(x=epoch_idx, y=losses_valid, label="valid")
        plt.xlabel("epoch")
        plt.ylabel("loss")
        plt.show()

# Phase 2

## Experiment 1: Training on HellaSwag first

Setting up model hyperparameters

In [None]:
# Experiment 1: train a fresh roberta-base multiple-choice model on HellaSwag.
#********************SET HYPERPARAMETERS*********************************
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 5e-6
NUM_EPOCHS = 10

# This is the base roberta class that will be pretrained on various different tasks
roberta_base = RobertaModel.from_pretrained('roberta-base')
mc_model_hs = Multiple_Choice_Model(roberta_base)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Use the default CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
OPTIMIZER = AdamW(mc_model_hs.parameters(), lr=ADAM_LEARNING_RATE)

#********************DATA*******************************
# Training batches are padded across examples by prepare_batch_MC. The lambda
# looks up the global `tokenizer` when a batch is collated, not when the
# loader is created.
train_loader_hs = DataLoader(HellaSwagDataset(tokenizer, train_data_hs, train_labels_hs), 
                             batch_size=TRAIN_BATCH_SIZE, 
                             shuffle=True, 
                             collate_fn=lambda batch: prepare_batch_MC(batch, tokenizer))
# The validation loader has no collate_fn and yields one example per batch.
# NOTE(review): presumably this relies on DataLoader's default collation
# being sufficient for single-example batches — confirm before raising
# batch_size.
val_loader_hs = DataLoader(HellaSwagDataset(tokenizer, dev_data_hs, dev_labels_hs), 
                           batch_size=1, 
                           shuffle=False)

#********************SET UP AND RUN TRAINING*******************************
# Trainer moves the model to DEVICE; 'hs' selects the HellaSwag class names
# for the per-epoch classification report.
trainer_hs = Trainer(mc_model_hs, OPTIMIZER, DEVICE)
trainer_hs.run_training(train_loader_hs, val_loader_hs, save_location='roberta-pretrain-hellaswag-batch-size-8-lr-5e-6', dataset='hs', n_epochs=NUM_EPOCHS)

## Experiment 2: Training on aNLI first

In [None]:
# Experiment 2: train a fresh roberta-base multiple-choice model on aNLI.
#********************SET HYPERPARAMETERS*********************************
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 5e-6
NUM_EPOCHS = 10

# This is the base roberta class that will be pretrained on various different tasks
roberta_base = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
mc_model_anli = Multiple_Choice_Model(roberta_base)

# Use the default CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
OPTIMIZER = AdamW(mc_model_anli.parameters(), lr=ADAM_LEARNING_RATE)


#********************DATA*******************************
# Training batches are padded across examples by prepare_batch_MC. The lambda
# looks up the global `tokenizer` when a batch is collated, not when the
# loader is created.
train_loader_anli = DataLoader(aNliDataset(tokenizer, train_data_anli, train_labels_anli), 
                             batch_size=TRAIN_BATCH_SIZE, 
                             shuffle=True, 
                             collate_fn=lambda batch: prepare_batch_MC(batch, tokenizer))

# The validation loader has no collate_fn and yields one example per batch.
# NOTE(review): presumably this relies on DataLoader's default collation
# being sufficient for single-example batches — confirm before raising
# batch_size.
val_loader_anli = DataLoader(aNliDataset(tokenizer, dev_data_anli, dev_labels_anli), 
                           batch_size=1, 
                           shuffle=False)

#********************SET UP AND RUN TRAINING*******************************
# Trainer moves the model to DEVICE; 'anli' selects the aNLI class names for
# the per-epoch classification report.
trainer_anli = Trainer(mc_model_anli, OPTIMIZER, DEVICE)
trainer_anli.run_training(train_loader_anli, val_loader_anli, save_location='roberta-pretrain-anli', dataset='anli', n_epochs=NUM_EPOCHS)

## Experiment 3: Training HellaSwag after aNLI (aNLI-->HellaSwag)

In [None]:
# Experiment 3: continue training the aNLI-pretrained model on HellaSwag.
#*********************RESTORE MODEL TRAINED ON aNLI************************
# map_location="cpu" deserializes the checkpoint onto the CPU regardless of
# the device it was saved from, so the cell also works when that GPU is not
# present (including the CPU fallback below). Trainer moves the model to
# DEVICE afterwards.
mc_model_anli_restored_state_dict = torch.load(
    os.path.join(cur_dir, out_dir, "roberta-pretrain-anli", 'model-mc-checkpoint-epoch1.pt'),
    map_location="cpu",
)

roberta_base = RobertaModel.from_pretrained('roberta-base') # placeholder
mc_model_anli_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
mc_model_anli_restored.load_state_dict(mc_model_anli_restored_state_dict)
print("****Model pretrained on aNLI, restored***")

#********************SET HYPERPARAMETERS*********************************
TRAIN_BATCH_SIZE = 8
# NOTE(review): the learning rate is 1e-5 here, while the save_location passed
# to run_training below ends in "lr-5e-6" — confirm which one is intended.
ADAM_LEARNING_RATE = 1e-5
NUM_EPOCHS = 10
# NOTE(review): 'cuda:4' is hard-coded and requires at least five visible GPUs.
DEVICE = torch.device('cuda:4') if torch.cuda.is_available() else torch.device('cpu')
OPTIMIZER = AdamW(mc_model_anli_restored.parameters(), lr=ADAM_LEARNING_RATE)
print("****Hyperprameters set****")

#********************DATA*******************************
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Training batches are padded across examples by prepare_batch_MC.
train_loader_hs = DataLoader(HellaSwagDataset(tokenizer, train_data_hs, train_labels_hs),
                             batch_size=TRAIN_BATCH_SIZE,
                             shuffle=True,
                             collate_fn=lambda batch: prepare_batch_MC(batch, tokenizer))

# Validation uses one example per batch and no custom collate function.
val_loader_hs = DataLoader(HellaSwagDataset(tokenizer, dev_data_hs, dev_labels_hs),
                           batch_size=1,
                           shuffle=False)
print("****Data, prepared****")

#********************SET UP AND RUN TRAINING*******************************
trainer_anli_hs = Trainer(mc_model_anli_restored, OPTIMIZER, DEVICE)
print("****Training****")
trainer_anli_hs.run_training(train_loader_hs, val_loader_hs, save_location='roberta-pretrain-hellaswag-after-anli-lr-5e-6', dataset='hs', n_epochs=NUM_EPOCHS)

## Experiment 4: Training aNLI after HellaSwag (HellaSwag-->aNLI)

In [None]:
# Experiment 4: continue training the HellaSwag-pretrained model on aNLI.
#*********************RESTORE MODEL TRAINED ON HellaSwag************************
# map_location="cpu" deserializes the checkpoint onto the CPU regardless of
# the device it was saved from, so the cell also works when that GPU is not
# present (including the CPU fallback below). Trainer moves the model to
# DEVICE afterwards.
mc_model_hs_restored_state_dict = torch.load(
    os.path.join(cur_dir, out_dir, "roberta-pretrain-hellaswag-batch-size-8-lr-5e-6", 'model-mc-checkpoint-epoch5.pt'),
    map_location="cpu",
)

roberta_base = RobertaModel.from_pretrained('roberta-base') # placeholder
mc_model_hs_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
mc_model_hs_restored.load_state_dict(mc_model_hs_restored_state_dict)
# The restored checkpoint is the HellaSwag one; the message used to say aNLI.
print("****Model pretrained on HellaSwag, restored***")

#********************SET HYPERPARAMETERS*********************************
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 5e-6
NUM_EPOCHS = 10
# NOTE(review): 'cuda:3' is hard-coded and requires at least four visible GPUs.
DEVICE = torch.device('cuda:3') if torch.cuda.is_available() else torch.device('cpu')
OPTIMIZER = AdamW(mc_model_hs_restored.parameters(), lr=ADAM_LEARNING_RATE)
print("****Hyperprameters set****")

#********************DATA*******************************
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Training batches are padded across examples by prepare_batch_MC.
train_loader_anli = DataLoader(aNliDataset(tokenizer, train_data_anli, train_labels_anli),
                               batch_size=TRAIN_BATCH_SIZE,
                               shuffle=True,
                               collate_fn=lambda batch: prepare_batch_MC(batch, tokenizer))

# Validation uses one example per batch and no custom collate function.
val_loader_anli = DataLoader(aNliDataset(tokenizer, dev_data_anli, dev_labels_anli),
                             batch_size=1,
                             shuffle=False)
print("****Data, prepared****")

#********************SET UP AND RUN TRAINING*******************************
trainer_hs_anli = Trainer(mc_model_hs_restored, OPTIMIZER, DEVICE)
print("****Training****")
trainer_hs_anli.run_training(train_loader_anli, val_loader_anli, save_location='roberta-pretrain-anli-after-hellaswag-lr-5e-6', dataset='anli', n_epochs=NUM_EPOCHS)

# Phase 3: Fine-tuning on SocialIQA

In [None]:
# Phase 3: fine-tune the HellaSwag --> aNLI model on SocialIQA.
#*********************RESTORE MODEL TRAINED ON HellaSwag --> aNLI************************
# map_location="cpu" deserializes the checkpoint onto the CPU regardless of
# the device it was saved from, so the cell also works when that GPU is not
# present (including the CPU fallback below). Trainer moves the model to
# DEVICE afterwards.
mc_model_hs_anli_restored_state_dict = torch.load(
    os.path.join(cur_dir, out_dir, "roberta-pretrain-anli-after-hellaswag-lr-5e-6", 'model-mc-checkpoint-epoch10.pt'),
    map_location="cpu",
)

roberta_base = RobertaModel.from_pretrained('roberta-base') # placeholder
mc_model_hs_anli_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
mc_model_hs_anli_restored.load_state_dict(mc_model_hs_anli_restored_state_dict)

#********************SET HYPERPARAMETERS*********************************
TRAIN_BATCH_SIZE = 8
ADAM_LEARNING_RATE = 1e-5
NUM_EPOCHS = 10

# NOTE(review): 'cuda:4' is hard-coded and requires at least five visible GPUs.
DEVICE = torch.device('cuda:4') if torch.cuda.is_available() else torch.device('cpu')
OPTIMIZER = AdamW(mc_model_hs_anli_restored.parameters(), lr=ADAM_LEARNING_RATE)


#********************DATA*******************************
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Training batches are padded across examples by prepare_batch_MC.
train_loader_siqa = DataLoader(SocialiqaDataset(tokenizer, train_data_siqa, train_labels_siqa),
                               batch_size=TRAIN_BATCH_SIZE,
                               shuffle=True,
                               collate_fn=lambda batch: prepare_batch_MC(batch, tokenizer))
# Validation uses one example per batch and no custom collate function.
val_loader_siqa = DataLoader(SocialiqaDataset(tokenizer, dev_data_siqa, dev_labels_siqa),
                             batch_size=1,
                             shuffle=False)

#********************SET UP AND RUN TRAINING*******************************
trainer_socialiqa = Trainer(mc_model_hs_anli_restored, OPTIMIZER, DEVICE)
trainer_socialiqa.run_training(train_loader_siqa, val_loader_siqa, save_location='roberta-hellaswag-anli-socialiqa', dataset='siqa', n_epochs=NUM_EPOCHS)

# Evaluation

In [19]:
from sklearn.metrics import classification_report
from transformers import RobertaForMultipleChoice

# Evaluation runs on GPU index 5 when CUDA is available, otherwise on the CPU.
DEVICE = torch.device('cuda:5' if torch.cuda.is_available() else 'cpu')
TOKENIZER = RobertaTokenizer.from_pretrained('roberta-base')

# SocialIQA dev split, served one example per batch and never shuffled.
val_loader_siqa = DataLoader(
    SocialiqaDataset(TOKENIZER, dev_data_siqa, dev_labels_siqa),
    batch_size=1,
    shuffle=False,
)

Model just trained on SocialIQA

In [15]:
# Checkpoint of the model fine-tuned on SocialIQA only (epoch 2); its tensors
# are mapped onto DEVICE while loading.
model1_checkpoint_path = os.path.join(
    cur_dir, out_dir, "roberta-base-socialiqa", 'model-mc-checkpoint-epoch2.pt'
)
restore_dict = torch.load(model1_checkpoint_path, map_location=DEVICE)

# A fresh roberta-base encoder is needed to construct the wrapper before the
# checkpoint weights are loaded into it.
roberta_base = RobertaModel.from_pretrained('roberta-base')
model1 = Multiple_Choice_Model(roberta_base)

# Left as the last expression so the cell displays the key-matching result.
model1.load_state_dict(restore_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

Model aNLI --> HellaSwag --> SocialIQA (lr = 1e-5)

In [16]:
# Checkpoint of the aNLI --> HellaSwag --> SocialIQA run (lr = 1e-5), epoch 5.
folder = "roberta-anli-hellaswag-socialiqa-lr-1e-5"
checkpoint = 'model-mc-checkpoint-epoch5.pt'
# map_location keeps torch.load from restoring the tensors onto whichever GPU
# the checkpoint was saved from (which may be busy or absent on this machine).
restore_dict = torch.load(os.path.join(cur_dir, out_dir, folder, checkpoint), map_location=DEVICE)

# Placeholder encoder: only needed to build the module before the checkpoint
# weights are loaded into it.
roberta_base = RobertaModel.from_pretrained('roberta-base')
model2 = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again; kept as the last
# expression so the cell displays the key-matching result.
model2.load_state_dict(restore_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

Model aNLI --> HellaSwag --> SocialIQA (lr = 5e-6)


In [17]:
# Checkpoint of the aNLI --> HellaSwag --> SocialIQA run (lr = 5e-6), epoch 8.
folder = "roberta-anli-hellaswag-socialiqa-lr-5e-6"
checkpoint = 'model-mc-checkpoint-epoch8.pt'
# map_location keeps torch.load from restoring the tensors onto whichever GPU
# the checkpoint was saved from (which may be busy or absent on this machine).
restore_dict = torch.load(os.path.join(cur_dir, out_dir, folder, checkpoint), map_location=DEVICE)

# Placeholder encoder: only needed to build the module before the checkpoint
# weights are loaded into it.
roberta_base = RobertaModel.from_pretrained('roberta-base')
model3 = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again; kept as the last
# expression so the cell displays the key-matching result.
model3.load_state_dict(restore_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

Model HellaSwag --> aNLI --> SocialIQA (lr = 1e-5)

In [18]:
# Checkpoint of the HellaSwag --> aNLI --> SocialIQA run, epoch 10.
folder = "roberta-hellaswag-anli-socialiqa"
checkpoint = 'model-mc-checkpoint-epoch10.pt'
# map_location keeps torch.load from restoring the tensors onto whichever GPU
# the checkpoint was saved from (which may be busy or absent on this machine).
restore_dict = torch.load(os.path.join(cur_dir, out_dir, folder, checkpoint), map_location=DEVICE)

# Placeholder encoder: only needed to build the module before the checkpoint
# weights are loaded into it.
roberta_base = RobertaModel.from_pretrained('roberta-base')
model4 = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again; kept as the last
# expression so the cell displays the key-matching result.
model4.load_state_dict(restore_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

# Checking against baseline (pretrained Roberta for MC)

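The baseline is HuggingFace's `RobertaForMultipleChoice` initialized from `roberta-base`. Its classification head is randomly initialized (it has not been fine-tuned on any multiple-choice task), so it is expected to score close to chance. We check
performance on the validation set of SocialIQA. Performance in this case is measured as multiclass classification and shown in the classification report, which consists of precision, recall, F1 score, and accuracy. The validation set has a fairly balanced distribution of examples whose correct choice is A, B, or C.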

Performance comparison (macro average) P, R, F1, followed by accuracy in bold:

*   Baseline: 0.36, 0.36, 0.36, **0.36**
*   Pretrained on socialiqa (2 epochs): 0.70, 0.70, 0.70, **0.70**
*   Pretrained on hellaswag, then socialiqa:  0.70, 0.70, 0.70, **0.70**





In [21]:
# Baseline: RobertaForMultipleChoice built from roberta-base with no fine-tuning,
# scored on the SocialIQA dev set one example at a time.
tokenizer2 = RobertaTokenizer.from_pretrained('roberta-base')
baseline_model = RobertaForMultipleChoice.from_pretrained('roberta-base')
baseline_model.eval()  # inference only: make sure dropout is disabled

# Context and question are joined with two separator tokens.
sep_pair = tokenizer2.sep_token + tokenizer2.sep_token

preds = []
# No gradients are needed for evaluation; skipping autograd bookkeeping saves
# memory and time. The gold labels are not passed to the model because the loss
# it would compute from them was never used.
with torch.no_grad():
    for point in dev_data_siqa:
        # point = (context, question, answer A, answer B, answer C)
        input_answers = [point[2], point[3], point[4]]
        context_question = point[0] + sep_pair + point[1]
        # The same context/question text is paired with each candidate answer.
        input_context_question = [context_question] * len(input_answers)
        encoding = tokenizer2(input_context_question, input_answers, return_tensors='pt', padding=True)
        # unsqueeze(0) adds the batch dimension (batch size is 1) in front of the choices.
        outputs = baseline_model(**{k: v.unsqueeze(0) for k, v in encoding.items()})
        preds.append(torch.argmax(outputs.logits, dim=1).tolist()[0])

# target_names is reused by the evaluation cells of the fine-tuned models.
target_names = ['Answer A', 'Answer B', 'Answer C']
print("Not trained on socialiqa model evaluation")
print(classification_report(dev_labels_siqa, preds, target_names=target_names))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultipleChoice: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

Not trained on socialiqa model evaluation
              precision    recall  f1-score   support

    Answer A       0.32      0.33      0.32       643
    Answer B       0.32      0.32      0.32       654
    Answer C       0.35      0.35      0.35       657

    accuracy                           0.33      1954
   macro avg       0.33      0.33      0.33      1954
weighted avg       0.33      0.33      0.33      1954



# Model 1: Fine-tune on SocialIQA (lr = 1e-5)

In [22]:
# Trainer's constructor takes an optimizer, so one is built here even though
# this cell only evaluates; the learning rate value is not used.
optimizer = AdamW(model1.parameters(), lr=1e-5)
trainer_socialiqa = Trainer(model1, optimizer, DEVICE)

# Score model 1 on the SocialIQA dev loader and print the per-answer report.
eval_result = trainer_socialiqa.evaluate(val_loader_siqa)
epoch_loss_valid, labels, preds = eval_result
report = classification_report(labels, preds, target_names=target_names)
print(report)

0it [00:00, ?it/s]

              precision    recall  f1-score   support

    Answer A       0.71      0.71      0.71       643
    Answer B       0.70      0.68      0.69       654
    Answer C       0.70      0.72      0.71       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954



# Model 2: aNLI --> HellaSwag --> SocialIQA (lr = 1e-5)

In [23]:
# Trainer's constructor takes an optimizer, so one is built here even though
# this cell only evaluates; the learning rate value is not used.
optimizer = AdamW(model2.parameters(), lr=1e-5)
trainer_socialiqa = Trainer(model2, optimizer, DEVICE)

# Score model 2 on the SocialIQA dev loader and print the per-answer report.
eval_result = trainer_socialiqa.evaluate(val_loader_siqa)
epoch_loss_valid, labels, preds = eval_result
report = classification_report(labels, preds, target_names=target_names)
print(report)

0it [00:00, ?it/s]

              precision    recall  f1-score   support

    Answer A       0.69      0.68      0.68       643
    Answer B       0.68      0.68      0.68       654
    Answer C       0.69      0.70      0.70       657

    accuracy                           0.69      1954
   macro avg       0.69      0.69      0.69      1954
weighted avg       0.69      0.69      0.69      1954



# Model 3:  aNLI --> HellaSwag --> SocialIQA (lr = 5e-6)


In [24]:
# Trainer's constructor takes an optimizer, so one is built here even though
# this cell only evaluates; the learning rate value is not used.
optimizer = AdamW(model3.parameters(), lr=1e-5)
trainer_socialiqa = Trainer(model3, optimizer, DEVICE)

# Score model 3 on the SocialIQA dev loader and print the per-answer report.
eval_result = trainer_socialiqa.evaluate(val_loader_siqa)
epoch_loss_valid, labels, preds = eval_result
report = classification_report(labels, preds, target_names=target_names)
print(report)

0it [00:00, ?it/s]

              precision    recall  f1-score   support

    Answer A       0.69      0.67      0.68       643
    Answer B       0.69      0.68      0.69       654
    Answer C       0.69      0.72      0.70       657

    accuracy                           0.69      1954
   macro avg       0.69      0.69      0.69      1954
weighted avg       0.69      0.69      0.69      1954



# Model 4: HellaSwag --> aNLI --> SocialIQA (lr = 1e-5)

In [25]:
# Trainer's constructor takes an optimizer, so one is built here even though
# this cell only evaluates; the learning rate value is not used.
optimizer = AdamW(model4.parameters(), lr=1e-5)
trainer_socialiqa = Trainer(model4, optimizer, DEVICE)

# Score model 4 on the SocialIQA dev loader and print the per-answer report.
eval_result = trainer_socialiqa.evaluate(val_loader_siqa)
epoch_loss_valid, labels, preds = eval_result
report = classification_report(labels, preds, target_names=target_names)
print(report)

0it [00:00, ?it/s]

              precision    recall  f1-score   support

    Answer A       0.71      0.70      0.70       643
    Answer B       0.70      0.69      0.70       654
    Answer C       0.69      0.71      0.70       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954

