# Top

In [None]:
!pip install transformers
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import os
import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from transformers import RobertaModel, RobertaTokenizer, AdamW

from tqdm.auto import tqdm

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

cur_dir = "/content/drive/MyDrive/Colab_Notebooks/NLP_244_Advanced_ML/final_project_socialiqa/socialiqa-nlp244"
data_dir = "socialiqa-train-dev"
out_dir = "out"

# Extract SocialIQA data

In [3]:
file_train = os.path.join(cur_dir, "socialiqa-train-dev/train.jsonl")
file_dev = os.path.join(cur_dir, "socialiqa-train-dev/dev.jsonl")

json_train = pd.read_json(path_or_buf=file_train, lines=True)
json_dev = pd.read_json(path_or_buf=file_dev, lines=True)

# list of tuples (context, question, A, B++++, C)
train_data = [elem for elem in zip(json_train['context'].tolist(), 
                                   json_train['question'].tolist(), 
                                   json_train['answerA'].tolist(), 
                                   json_train['answerB'].tolist(), 
                                   json_train['answerC'].tolist())]

dev_data = [elem for elem in zip(json_dev['context'].tolist(), 
                                   json_dev['question'].tolist(), 
                                   json_dev['answerA'].tolist(), 
                                   json_dev['answerB'].tolist(), 
                                   json_dev['answerC'].tolist())]

len(train_data), len(dev_data)

(33410, 1954)

In [4]:
train_labels = []
dev_labels = []
with open(os.path.join(cur_dir, data_dir, "train-labels.lst")) as f:
    for line in f:
      train_labels.append(int(line.split()[0]))

with open(os.path.join(cur_dir, data_dir, "dev-labels.lst")) as f:
    for line in f:
      dev_labels.append(int(line.split()[0]))

train_labels = [label-1 for label in train_labels]
dev_labels = [label-1 for label in dev_labels]

len(train_labels), len(dev_labels)

(33410, 1954)

In [5]:
class SocialiqaDataset(Dataset):
    """
    This dataset class for socialiqa might be able to be generalized for 
    HellaSwag and other tasks.

    This is the context/question + multiple choice format, and each example
    consists of num choices lists of encoded strings. Note that the input will
    be encoded in this stage. prepare_batch will take care of padding across examples
    in the batch-level. 
    """
    def __init__(self, tokenizer, x, y):
        # x: list of tuples containing (context, question, answer1, answer2, answer3)
        # y: list of indices of the correct answer
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        point = self.x[idx]
        input_context_question = [point[0] + self.roberta_tokenizer.sep_token + self.roberta_tokenizer.sep_token + point[1], point[0] + self.roberta_tokenizer.sep_token + self.roberta_tokenizer.sep_token + point[1], point[0] + self.roberta_tokenizer.sep_token + self.roberta_tokenizer.sep_token + point[1]]
        input_answers = [point[2], point[3], point[4]]
        encoded_text_train = self.roberta_tokenizer(input_context_question, input_answers, return_tensors='pt', padding=True)
        return (encoded_text_train, self.y[idx])

    def __len__(self):
        return len(self.x)


def prepare_batch_MC(batch, tokenizer):
    """
    This collate function will pad the batch to be the same length. This requires
    flattening, then unflattening for the multiple choice format.
    One example will be a list of length 'num choices', each element being a list
    of (encoded) tokens representing qustion/answer [sep] choicex
    """
    # batch: [batch_size, (text, label)]
    batch_size = len(batch)

    features, labels = zip(*batch)
    # features: tuple of length batch_size, 
    #        each element is a dict with keys = ["input_ids", "attention_mask"]
    # labels: tuple of ints (0, 1, 2) of length batch_size
    num_choices = len(features[0]["input_ids"])
    
    # flatten
    flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
    flattened_features = sum(flattened_features, [])
    # flattened_features list length num_choices*batch_size

    batch = tokenizer.pad(
            flattened_features,
            padding=True,
            return_tensors="pt",
        )
    
    # Un-flatten
    batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
    return (batch, torch.tensor(labels, dtype=torch.int64))


In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset_train = SocialiqaDataset(tokenizer, train_data, train_labels)
dataset_dev = SocialiqaDataset(tokenizer, dev_data, dev_labels)

# Model Class

In [7]:
class Multiple_Choice_Model(nn.Module):
    def __init__(self, roberta_model: RobertaModel, dropout: float = None):
          super(Multiple_Choice_Model, self).__init__()
          self.roberta = roberta_model
          self.dropout = nn.Dropout(self.roberta.config.hidden_dropout_prob)
          print(f"Initializing with hidden size {self.roberta.config.hidden_size}")
          self.classifier = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids: torch.tensor, attention_mask: torch.tensor, labels=None):
          num_choices = input_ids.shape[1] 
          flat_input_ids = input_ids.view(-1, input_ids.size(-1))
          flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))

          outputs = self.roberta(
              input_ids = flat_input_ids,
              attention_mask=flat_attention_mask,
          )
          pooled_output = outputs[1] 

          pooled_output = self.dropout(pooled_output)
          logits = self.classifier(pooled_output)
          reshaped_logits = logits.view(-1, num_choices)

          loss_fct = nn.CrossEntropyLoss()
          loss = loss_fct(reshaped_logits, labels)

          return loss, reshaped_logits

In [8]:
from sklearn.metrics import classification_report

class Trainer(object):
    """
    Trainer for training a multiple choice classification model
    """

    def __init__(self, model, optimizer, device="cpu"):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device

    def _print_summary(self):
        print(self.model)
        print(self.optimizer)

    def train(self, loader):
        """
        Run a single epoch of training
        """

        self.model.train() # Run model in training mode
        loss = None

        epoch_true_labels = []
        epoch_preds = []
        for i, batch in tqdm(enumerate(loader)):
            # clear gradient
            self.optimizer.zero_grad() 

            # input_ids shape: (batch_size, num_choices, sequence_length)
            input_ids = batch[0]['input_ids'].to(self.device)
            # input_ids shape: (batch_size, num_choices, sequence_length)
            attention_mask = batch[0]['attention_mask'].to(self.device)
            # labels shape: (batch_size, )
            labels = batch[1].to(self.device)

            outputs = self.model(input_ids=input_ids, 
                            attention_mask=attention_mask,
                            labels=labels)
            loss, logits = outputs[0], outputs[1]
            
            epoch_true_labels.extend(labels.tolist())
            epoch_preds.extend(torch.argmax(nn.Softmax(dim=1)(logits), dim=1).tolist())
            
            # back propagation
            loss.backward()
            # do gradient descent
            self.optimizer.step() 

        # Just returning the last loss
        return loss, epoch_true_labels, epoch_preds

    def evaluate(self, loader):
        """
        Evaluate the model on a validation set.
        Only do batch size = 1.
        """

        self.model.eval() # Run model in eval mode (disables dropout layer)
        loss = None

        epoch_true_labels = []
        epoch_preds = []
        with torch.no_grad(): # Disable gradient computation - required only during training
            for i, batch in tqdm(enumerate(loader)):
                # input_ids shape: (batch_size, num_choices, sequence_length)
                input_ids = batch[0]['input_ids'].to(self.device)
                # input_ids shape: (batch_size, num_choices, sequence_length)
                attention_mask = batch[0]['attention_mask'].to(self.device)
                # labels shape: (batch_size, )
                labels = batch[1].to(self.device)

                outputs = self.model(input_ids=input_ids, 
                                attention_mask=attention_mask,
                                labels=labels)
                loss, logits = outputs[0], outputs[1]
                
                epoch_true_labels.extend(labels.tolist())
                epoch_preds.extend(torch.argmax(nn.Softmax(dim=1)(logits), dim=1).tolist())
            
        # Just returning the last loss
        return loss, epoch_true_labels, epoch_preds

    def get_model_dict(self):
        return self.model.state_dict()

    def run_training(self, train_loader, valid_loader, n_epochs=3):
        # Useful for us to review what experiment we're running
        # Normally, you'd want to save this to a file
        # self._print_summary()
        losses_valid = []
        losses_train = []
        best_valid = float("inf")
        for i in range(n_epochs):
            target_names = ['Answer A', 'Answer B', 'Answer C']
            epoch_loss_train, labels, preds = self.train(train_loader)
            print("Train eval")
            print(classification_report(labels, preds, target_names=target_names))

            epoch_loss_valid, labels, preds = self.evaluate(valid_loader)
            print("Valid eval")
            print(classification_report(labels, preds, target_names=target_names))


            if epoch_loss_valid < best_valid:
                best_valid = epoch_loss_valid
                torch.save(self.get_model_dict(), os.path.join(cur_dir, out_dir, "roberta-base-socialiqa", f'model-mc-checkpoint-epoch{i+1}.pt'))
            
            losses_train.append(epoch_loss_train.tolist())
            losses_valid.append(epoch_loss_valid.tolist())
            print(f"Epoch {i}")
            print(f"Train loss: {epoch_loss_train}")
            print(f"Valid loss: {epoch_loss_valid}")

        train_epoch_idx = range(len(losses_train))
        valid_epoch_idx = range(len(losses_valid))
        # sns.lineplot(epoch_idx, all_losses)
        sns.lineplot(train_epoch_idx, losses_train)
        sns.lineplot(valid_epoch_idx, losses_valid)
        plt.show()

In [9]:
roberta_base = RobertaModel.from_pretrained('roberta-base')
mc_model = Multiple_Choice_Model(roberta_base)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 'W' stands for 'Weight Decay fix"
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = AdamW(mc_model.parameters(), lr=1e-5)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initializing with hidden size 768


In [10]:
train_loader = DataLoader(dataset_train, batch_size=8, shuffle=True, collate_fn=lambda batch: prepare_batch_MC(batch, tokenizer))
val_loader = DataLoader(dataset_dev, batch_size=1, shuffle=False)

# Training

In [None]:
trainer_socialiqa = Trainer(mc_model, optimizer, device)
trainer_socialiqa.run_training(train_loader, val_loader, n_epochs=10)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.71      0.70      0.70     11274
    Answer B       0.70      0.70      0.70     11176
    Answer C       0.70      0.71      0.70     10960

    accuracy                           0.70     33410
   macro avg       0.70      0.70      0.70     33410
weighted avg       0.70      0.70      0.70     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.67      0.68      0.67       643
    Answer B       0.68      0.64      0.66       654
    Answer C       0.68      0.71      0.69       657

    accuracy                           0.68      1954
   macro avg       0.68      0.68      0.68      1954
weighted avg       0.68      0.68      0.68      1954

Epoch 0
Train loss: 0.6505564451217651
Valid loss: 0.12656888365745544


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Train eval
              precision    recall  f1-score   support

    Answer A       0.80      0.80      0.80     11274
    Answer B       0.81      0.79      0.80     11176
    Answer C       0.79      0.81      0.80     10960

    accuracy                           0.80     33410
   macro avg       0.80      0.80      0.80     33410
weighted avg       0.80      0.80      0.80     33410



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Valid eval
              precision    recall  f1-score   support

    Answer A       0.71      0.71      0.71       643
    Answer B       0.70      0.68      0.69       654
    Answer C       0.70      0.72      0.71       657

    accuracy                           0.70      1954
   macro avg       0.70      0.70      0.70      1954
weighted avg       0.70      0.70      0.70      1954

Epoch 1
Train loss: 0.37591660022735596
Valid loss: 0.04180977866053581


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [None]:
file_name = 'model-mc-checkpoint-.pt'
torch.save(mc_model.state_dict(), os.path.join(cur_dir, out_dir, file_name))

# Evaluation

In [None]:
mc_model_restored_state_dict = torch.load(os.path.join(cur_dir, out_dir, 'model-mc-checkpoint-epoch10.pt'), )

roberta_base = RobertaModel.from_pretrained('roberta-base')
mc_model_restored = Multiple_Choice_Model(roberta_base)

# Restore state dict to load the same weights again
mc_model_restored.load_state_dict(mc_model_restored_state_dict)

In [None]:
mc_model_restored.to(device)
mc_model_restored.eval()
val_loader = DataLoader(dataset_dev, batch_size=1, shuffle=False)
for i, batch in tqdm(enumerate(val_loader)):
                # input_ids shape: (batch_size, num_choices, sequence_length)
                input_ids = batch[0]['input_ids'].to(device)
                # input_ids shape: (batch_size, num_choices, sequence_length)
                attention_mask = batch[0]['attention_mask'].to(device)
                # labels shape: (batch_size, )
                labels = batch[1].to(device)

                outputs = mc_model_restored(input_ids=input_ids, 
                                attention_mask=attention_mask,
                                labels=labels)
                _, logits = outputs[0], outputs[1]
                print(logits)
                

# Debugging with pretrained Roberta for MC

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMultipleChoice.from_pretrained('roberta-base')

context = "We are talking about pizza"
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels)  # batch size is 1

# the linear classifier still needs to be trained
loss = outputs.loss
logits = outputs.logits
print(logits)