In [1]:
import os
import pandas as pd

from google.colab import drive
# Mount Google Drive so the project folder below is reachable from the Colab VM.
drive.mount('/content/drive')

# Absolute path of the project folder on the mounted Drive (specific to the author's Drive layout).
cur_dir = "/content/drive/MyDrive/Colab_Notebooks/NLP_244_Advanced_ML/final_project_socialiqa/socialiqa-nlp244"
# Sub-folder of cur_dir holding the dataset files; joined with cur_dir when the label files are read.
data_dir = "socialiqa-train-dev"
# Output folder name; not referenced by any of the cells visible in this part of the notebook.
out_dir = "out"

# Unpinned install: the recorded run below resolved this to transformers 4.6.0.
!pip install transformers

Mounted at /content/drive
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 9.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 36.4MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 36.0MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0

Notebook for trying out multiple-choice models on the SocialIQA dataset.

In [2]:
# Build the split paths from `data_dir` (the same way the label files are located)
# instead of repeating the folder name inside a string literal.
file_train = os.path.join(cur_dir, data_dir, "train.jsonl")
file_dev = os.path.join(cur_dir, data_dir, "dev.jsonl")

json_train = pd.read_json(path_or_buf=file_train, lines=True)
json_dev = pd.read_json(path_or_buf=file_dev, lines=True)

# Columns that make up one example, in the order the dataset class expects them.
EXAMPLE_COLUMNS = ["context", "question", "answerA", "answerB", "answerC"]


def _frame_to_examples(frame):
    """Return one (context, question, answerA, answerB, answerC) tuple per row of `frame`."""
    return list(zip(*(frame[column].tolist() for column in EXAMPLE_COLUMNS)))


# list of tuples (context, question, A, B, C)
train_data = _frame_to_examples(json_train)
dev_data = _frame_to_examples(json_dev)

len(train_data), len(dev_data)

(33410, 1954)

In [3]:
def _read_labels(path):
    """Read a label file and return the labels as 0-based answer indices.

    The first whitespace-separated token of every non-blank line is parsed as an
    integer and shifted down by one. Blank lines (e.g. a trailing newline at the
    end of the file) are skipped instead of raising IndexError.
    """
    with open(path) as f:
        return [int(line.split()[0]) - 1 for line in f if line.strip()]


train_labels = _read_labels(os.path.join(cur_dir, data_dir, "train-labels.lst"))
dev_labels = _read_labels(os.path.join(cur_dir, data_dir, "dev-labels.lst"))

# Labels are paired with examples purely by position, so the counts must agree.
assert len(train_labels) == len(train_data), "train labels/examples count mismatch"
assert len(dev_labels) == len(dev_data), "dev labels/examples count mismatch"

len(train_labels), len(dev_labels)

(33410, 1954)

In [4]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from transformers import RobertaModel, RobertaTokenizer

In [None]:
class SocialiqaDataset(Dataset):
    """
    Multiple-choice dataset for SocialIQA (context + question, several candidate answers).

    Every example is encoded on access as one tokenizer call over
    ``num_choices`` text pairs: the first text of each pair is
    ``context + sep + sep + question`` and the second is one candidate answer.
    The choices of a single example are padded to a common length here
    (``padding=True``); padding across examples of a batch is left to the
    collate function (``prepare_batch_MC`` in this notebook).

    The number of choices is taken from the example itself (everything after
    the first two fields), so tasks with a different number of answers can
    reuse the class unchanged.
    """
    def __init__(self, tokenizer, x, y):
        # x: list of tuples containing (context, question, answer1, answer2, ...)
        # y: list of indices of the correct answer
        self.roberta_tokenizer = tokenizer
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for example ``idx``."""
        point = self.x[idx]
        context, question = point[0], point[1]
        answers = list(point[2:])

        # The same context/question prompt is paired with every candidate answer.
        sep = self.roberta_tokenizer.sep_token
        prompt = context + sep + sep + question

        encoded_text_train = self.roberta_tokenizer(
            [prompt] * len(answers),
            answers,
            return_tensors='pt',
            padding=True,
        )
        return (encoded_text_train, self.y[idx])

    def __len__(self):
        return len(self.x)


def prepare_batch_MC(batch, tokenizer):
    """
    Collate function that pads a batch of multiple-choice examples to one length.

    Each example holds ``num_choices`` encoded sequences (question/answer [sep]
    choice). The examples are flattened into ``batch_size * num_choices``
    single-sequence features, padded together with ``tokenizer.pad``, and then
    reshaped back to ``(batch_size, num_choices, seq_len)``.

    Args:
        batch: list of ``(features, label)`` pairs of length batch_size; each
            ``features`` is a mapping (e.g. "input_ids", "attention_mask") whose
            values can be indexed per choice.
        tokenizer: object exposing ``pad(features, padding=..., return_tensors=...)``.

    Returns:
        ``(padded, labels)`` where ``padded`` maps each feature name to a tensor
        viewed as ``(batch_size, num_choices, -1)`` and ``labels`` is an int64
        tensor with one entry per example.
    """
    # batch: [batch_size, (text, label)]
    batch_size = len(batch)
    print(f"Batch size: {batch_size}")

    features, labels = zip(*batch)
    # features: tuple of length batch_size,
    #        each element is a dict with keys = ["input_ids", "attention_mask"]
    # labels: tuple of ints (0, 1, 2) of length batch_size
    num_choices = len(features[0]["input_ids"])

    # Flatten in example-major order with a single comprehension; repeatedly
    # concatenating lists with sum(..., []) copies the growing list every time.
    flattened_features = [
        {k: v[i] for k, v in feature.items()}
        for feature in features
        for i in range(num_choices)
    ]
    # flattened_features: list of length num_choices*batch_size

    padded = tokenizer.pad(
        flattened_features,
        padding=True,
        return_tensors="pt",
    )

    # Un-flatten
    unflattened = {k: v.view(batch_size, num_choices, -1) for k, v in padded.items()}
    return (unflattened, torch.tensor(labels, dtype=torch.int64))


In [None]:
# One pretrained RoBERTa tokenizer is shared by the train and dev datasets.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset_train, dataset_dev = (
    SocialiqaDataset(tokenizer, examples, labels)
    for examples, labels in ((train_data, train_labels), (dev_data, dev_labels))
)

In [None]:
# Encode the first training example once and unpack it into (encoding, label);
# indexing the dataset twice would run the tokenizer twice for the same item.
x, y = dataset_train[0]
x, y

({'input_ids': tensor([[    0,   347, 35953,  1276,     7,    33,    10, 18906,     8,  4366,
             69,   964,   561,     4,     2,     2,  6179,    74,  5763,   619,
             25,    10,   898,   116,     2,     2,  3341,  5190,     2,     1,
              1,     1],
         [    0,   347, 35953,  1276,     7,    33,    10, 18906,     8,  4366,
             69,   964,   561,     4,     2,     2,  6179,    74,  5763,   619,
             25,    10,   898,   116,     2,     2,  3341,  4959,   184,     2,
              1,     1],
         [    0,   347, 35953,  1276,     7,    33,    10, 18906,     8,  4366,
             69,   964,   561,     4,     2,     2,  6179,    74,  5763,   619,
             25,    10,   898,   116,     2,     2,   102,   205,  1441,     7,
             33,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# Decode the three encoded choices of the example back to text, one per line.
decoded_choices = [tokenizer.decode(x['input_ids'][row]) for row in range(3)]
print(*decoded_choices, sep=" \n ")

<s>Cameron decided to have a barbecue and gathered her friends together.</s></s>How would Others feel as a result?</s></s>like attending</s><pad><pad><pad> 
 <s>Cameron decided to have a barbecue and gathered her friends together.</s></s>How would Others feel as a result?</s></s>like staying home</s><pad><pad> 
 <s>Cameron decided to have a barbecue and gathered her friends together.</s></s>How would Others feel as a result?</s></s>a good friend to have</s>


In [None]:
def _collate_with_tokenizer(batch):
    """Collate callback that forwards the notebook-level tokenizer to prepare_batch_MC."""
    return prepare_batch_MC(batch, tokenizer)


train_loader = DataLoader(
    dataset_train,
    batch_size=2,
    shuffle=True,
    collate_fn=_collate_with_tokenizer,
)
#val_loader = DataLoader(dataset_dev, batch_size=1, shuffle=False)


In [None]:
# Smoke-test the loader and collate function: fetch and print only the first batch.
for i, batch in zip(range(1), train_loader):
    print(batch)


Batch size: 2
({'input_ids': tensor([[[    0,   102,  1792,  5460,   439,  5651,    19,    69,  4252,     8,
           2037,    10,  3539,    25,    10,  2916,     4,     2,     2,  7608,
            222, 17095,  5460,   109,    42,   116,     2,     2,   757, 13447,
             69,  4252,     2,     1,     1,     1],
         [    0,   102,  1792,  5460,   439,  5651,    19,    69,  4252,     8,
           2037,    10,  3539,    25,    10,  2916,     4,     2,     2,  7608,
            222, 17095,  5460,   109,    42,   116,     2,     2, 11990,  1531,
              2,     1,     1,     1,     1,     1],
         [    0,   102,  1792,  5460,   439,  5651,    19,    69,  4252,     8,
           2037,    10,  3539,    25,    10,  2916,     4,     2,     2,  7608,
            222, 17095,  5460,   109,    42,   116,     2,     2,   225, 20768,
              5,  3539,     2,     1,     1,     1]],

        [[    0,   104, 15144,   956,     7,   422,    10,  2119, 22379,   463,
          

In [None]:
class Multiple_Choice_Model(nn.Module):
    """Encoder plus a linear head that assigns one score to every answer choice.

    Each (example, choice) sequence is encoded independently; the second element
    of the encoder output is passed through dropout and a ``hidden_size -> 1``
    linear layer, and the scores are regrouped per example so that cross-entropy
    can be taken over the choices.
    """

    def __init__(self, roberta_model: "RobertaModel", dropout: float = None):
        """
        Args:
            roberta_model: encoder module; must expose ``config.hidden_size`` and
                ``config.hidden_dropout_prob``.
            dropout: dropout probability applied before the classifier. When
                omitted (None), the encoder's ``config.hidden_dropout_prob`` is used.
        """
        super(Multiple_Choice_Model, self).__init__()
        self.roberta = roberta_model
        if dropout is None:
            dropout = self.roberta.config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout)
        # The config lives on the wrapped encoder, not on this module.
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels=None):
        """
        Args:
            input_ids: tensor whose dim 1 is the number of choices and whose last
                dim is the sequence length, i.e. (batch, num_choices, seq_len).
            attention_mask: mask with the same layout as ``input_ids``.
            labels: optional tensor of correct-choice indices, one per example.

        Returns:
            ``(loss, logits)`` where ``logits`` has one column per choice and
            ``loss`` is the cross-entropy over choices, or None without labels.
        """
        num_choices = input_ids.shape[1]
        # Merge the batch and choice dimensions so the encoder sees plain sequences.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))

        outputs = self.roberta(
            flat_input_ids,
            attention_mask=flat_attention_mask,
        )
        # Second element of the encoder output is used as the pooled representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # One score per sequence -> one row of num_choices scores per example.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        return loss, reshaped_logits


In [None]:
class Trainer(object):
    """
    Trainer for training a joint multi-label classification and NER model.

    Batches are expected to be dicts with the keys "input_ids",
    "attention_mask", "slot_labels" and "relation_labels"; the model is
    expected to return a sequence whose elements 2 and 3 are the slot loss and
    the relation loss.

    NOTE(review): this batch/output format differs from what
    ``prepare_batch_MC`` and ``Multiple_Choice_Model`` in this notebook produce
    (a ``(features, labels)`` tuple and ``(loss, logits)``), so this class looks
    carried over from another task and presumably still needs adapting.
    NOTE(review): ``tqdm`` is used below but is not imported in the cells shown
    here; it must be imported before ``train``/``evaluate`` are called.
    """

    def __init__(self, model, optimizer, device="cpu"):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device

    def _print_summary(self):
        print(self.model)
        print(self.optimizer)

    def train(self, loader):
        """
        Run a single epoch of training.

        Returns the (slot_loss, relation_loss) of the last batch, or
        (None, None) if the loader yields no batches.
        """

        self.model.train() # Run model in training mode
        slot_loss = None
        relation_loss = None
        for batch in tqdm(loader):
            # clear gradient
            self.optimizer.zero_grad()

            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            slot_labels = batch['slot_labels'].to(self.device)
            relation_labels = batch['relation_labels'].to(self.device)
            # Use the trainer's own model, not whatever global `model` happens to exist.
            outputs = self.model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 relation_labels=relation_labels,
                                 slot_labels=slot_labels)
            slot_loss, relation_loss = outputs[2], outputs[3]

            # back propagation
            slot_loss.backward(retain_graph=True) #need to retain_graph  when working with multiple losses
            relation_loss.backward()
            # do gradient descent
            self.optimizer.step()

        # Just returning the last loss
        return slot_loss, relation_loss

    def evaluate(self, loader):
        """
        Evaluate the model on a validation set.
        Only do batch size = 1.

        Returns the (slot_loss, relation_loss) of the last batch, or
        (None, None) if the loader yields no batches.
        """

        self.model.eval() # Run model in eval mode (disables dropout layer)
        slot_loss = None
        relation_loss = None
        with torch.no_grad(): # Disable gradient computation - required only during training
            for batch in tqdm(loader):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                slot_labels = batch['slot_labels'].to(self.device)
                relation_labels = batch['relation_labels'].to(self.device)
                # Use the trainer's own model, not whatever global `model` happens to exist.
                outputs = self.model(input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     relation_labels=relation_labels,
                                     slot_labels=slot_labels)
                slot_loss, relation_loss = outputs[2], outputs[3]

        # Just returning the last loss
        return slot_loss, relation_loss

    def get_model_dict(self):
        return self.model.state_dict()

    def run_training(self, train_loader, valid_loader, n_epochs=3):
        """Train for ``n_epochs`` epochs, evaluating and printing the last-batch losses after each."""
        # Useful for us to review what experiment we're running
        # Normally, you'd want to save this to a file
        #self._print_summary()

        for i in range(n_epochs):
            epoch_slot_loss_train, epoch_relation_loss_train = self.train(train_loader)
            epoch_slot_loss_valid, epoch_relation_loss_valid = self.evaluate(valid_loader)
            print(f"Epoch {i}")
            print(f"Train loss: {epoch_slot_loss_train} (slot), {epoch_relation_loss_train} (relation) ")
            print(f"Valid loss: {epoch_slot_loss_valid} (slot), {epoch_relation_loss_valid} (relation) ")