# intermediate_SARC
This notebook takes the HellaSwag multiple-choice dataset and trains an intermediate model.

## Platform Check
Ensure we're on an ARM environment. 

In [1]:
import platform

# Detect ARM from the CPU architecture instead of comparing the full platform
# string: that string also encodes the OS version, so an exact match stops
# working after any OS update.
is_arm = platform.machine().lower() in ("arm64", "aarch64")

if is_arm:
    print(f"We're Armed: {platform.platform()}")
else:
    print(f"WARNING! NOT ARMED: {platform.platform()}")

We're Armed: macOS-13.0-arm64-i386-64bit


## Imports & Settings

First, update working directory to parent so that we may use our custom functions

In [2]:
import os

# Remember the directory the kernel started in the first time this cell runs, so
# that re-running the cell always lands in the same parent directory instead of
# climbing one level higher on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

os.chdir(os.path.dirname(NOTEBOOK_DIR))
os.getcwd()

'/Users/jarradjinx/Library/Mobile Documents/com~apple~CloudDocs/EDU_leeds/LD_research/LD_ITESD'

In [4]:
import params
from utils import *
from trainer import *

import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm import trange

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from datasets import load_from_disk, load_metric
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# from transformers import BertTokenizer, BertForSequenceClassification

# suppress model warning
# Imported under an alias so that the name `logging` only ever refers to the
# standard-library module imported below.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# set logging level
import logging
logging.basicConfig(level='INFO')

# one seed shared by the general RNG seeding and the dataloader generator
SEED = 1

# set general seeds
# NOTE(review): set_seeds is presumably provided by `from utils import *` — confirm.
set_seeds(SEED)

# set dataloader generator seed
g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x29481afb0>

## Load Data

### HellaSwag

In [5]:
# "../data/inter_HellaSwag/hellaswag.hf"
# Relative to the current working directory, which was moved to the parent folder earlier.
HELLASWAG_PATH = "data/inter_HellaSwag/hellaswag.hf"
hellaswag_datasets = load_from_disk(HELLASWAG_PATH)

In [6]:
# Display the splits, feature names and row counts of the loaded dataset.
hellaswag_datasets

DatasetDict({
    train: Dataset({
        features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label', 'ending0', 'ending1', 'ending2', 'ending3'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label', 'ending0', 'ending1', 'ending2', 'ending3'],
        num_rows: 10042
    })
})

In [7]:
def show_one(example):
    """Print one example: its context, the four candidate continuations and the gold option.

    Args:
        example: mapping with the keys 'ctx_a', 'ctx_b', 'ending0'..'ending3'
            and 'label' (an index into the options A-D).
    """
    option_letters = ['A', 'B', 'C', 'D']
    ending_keys = ("ending0", "ending1", "ending2", "ending3")

    print(f"Context: {example['ctx_a']}")
    # Every option is the shared second-sentence start followed by its own ending.
    for letter, ending_key in zip(option_letters, ending_keys):
        print(f"  {letter} - {example['ctx_b']} {example[ending_key]}")
    print(f"\nGround truth: option {option_letters[example['label']]}")

# Print training row 250 with its four candidate endings and the gold option.
show_one(hellaswag_datasets["train"][250])

Context: A group of athletes row on canoes during a race in between buoys on a waterway.
  A - the men pass over a wooden structure in the river.
  B - the men paddle while crashing through endless waves in the river.
  C - the men cross the final numbered buoys and glide while slowing down after the race.
  D - the men go over large cliffs into a lagoon.

Ground truth: option C


## Preprocess

In [8]:
# NOTE(review): `use_fast` is normally an AutoTokenizer option; RobertaTokenizer is
# presumably the slow (pure-Python) tokenizer regardless, so the flag likely has no
# effect here — confirm, or switch to the fast class if that was the intent.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", use_fast=True)

In [9]:
ending_names = ["ending0", "ending1", "ending2", "ending3"]

def preprocess_function(examples):
    """Tokenize a batch of examples as (context, continuation) pairs, one pair per ending.

    Args:
        examples: batch in column form, i.e. a mapping from column name to a
            list of values. Reads "ctx_a", "ctx_b" and every column listed in
            `ending_names`.

    Returns:
        dict mapping each tokenizer output key to a list with one entry per
        example; each entry is a list holding one value per ending.
    """
    num_choices = len(ending_names)

    # Repeat each first sentence once per ending. Built flat in a single pass;
    # flattening nested lists with sum(..., []) re-copies the accumulator on
    # every step and is quadratic in the batch size.
    first_sentences = [context for context in examples["ctx_a"] for _ in range(num_choices)]

    # Pair the start of the second sentence with each of its possible endings.
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(examples["ctx_b"])
        for end in ending_names
    ]

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: regroup consecutive runs of `num_choices` items per example.
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [10]:
# Run the preprocessing on the first five training rows and report:
# number of examples, choices per example, token count of each choice of example 0.
examples = hellaswag_datasets["train"][:5]
features = preprocess_function(examples)
first_example_ids = features["input_ids"][0]
print(len(features["input_ids"]), len(first_example_ids), [len(choice_ids) for choice_ids in first_example_ids])

5 4 [40, 55, 44, 39]


In [11]:
idx = 3
# Decode each of the four tokenized choices of example `idx` back to text.
choice_ids = features["input_ids"][idx]
[tokenizer.decode(choice_ids[choice]) for choice in range(4)]

['<s>A tray of potatoes is loaded into the oven and removed. A large tray of cake is flipped over and placed on counter.</s></s>a large tray of meat is placed onto a baked potato.</s>',
 '<s>A tray of potatoes is loaded into the oven and removed. A large tray of cake is flipped over and placed on counter.</s></s>a large tray of meat, ls, and pickles are placed in the oven.</s>',
 '<s>A tray of potatoes is loaded into the oven and removed. A large tray of cake is flipped over and placed on counter.</s></s>a large tray of meat is poured into a midden.</s>',
 '<s>A tray of potatoes is loaded into the oven and removed. A large tray of cake is flipped over and placed on counter.</s></s>a large tray of meat is prepared then it is removed from the oven by a helper when done.</s>']

In [12]:
# Show training row 3 in readable form, for comparison with the decoded tokens.
show_one(hellaswag_datasets["train"][3])

Context: A tray of potatoes is loaded into the oven and removed. A large tray of cake is flipped over and placed on counter.
  A - a large tray of meat is placed onto a baked potato.
  B - a large tray of meat , ls, and pickles are placed in the oven.
  C - a large tray of meat is poured into a midden.
  D - a large tray of meat is prepared then it is removed from the oven by a helper when done.

Ground truth: option D


In [13]:
# Apply the preprocessing to every split; batched=True passes a batch of rows per call.
encoded_datasets = hellaswag_datasets.map(preprocess_function, batched=True)



In [14]:
# Display the encoded splits, which now also carry input_ids and attention_mask.
encoded_datasets

DatasetDict({
    train: Dataset({
        features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label', 'ending0', 'ending1', 'ending2', 'ending3', 'input_ids', 'attention_mask'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label', 'ending0', 'ending1', 'ending2', 'ending3', 'input_ids', 'attention_mask'],
        num_rows: 10042
    })
})

## Train

Download `transformers.RobertaForMultipleChoice`, which is a RoBERTa model with a multiple-choice classification head (a linear layer on top of the pooled output, followed by a softmax over the choices):

In [15]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaForMultipleChoice

# Load the RobertaForMultipleChoice model from the 'roberta-base' checkpoint
model = RobertaForMultipleChoice.from_pretrained('roberta-base')

# from torchinfo import summary
# summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Set model to device, initialize trainer

In [16]:
# Training configuration: evaluate once per epoch, three epochs, batch size 16.
args = TrainingArguments(
    output_dir="roberta-finetuned-hellaswag-test",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
)

In [17]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Every incoming feature is a dict holding a label (under "label" or
    "labels") plus per-choice lists such as "input_ids" and "attention_mask".
    The choices of all features are flattened into one list, padded together
    with ``tokenizer.pad`` and reshaped to (batch_size, num_choices, -1).

    The incoming feature dicts are left untouched, so the same list can be
    collated or inspected more than once.
    """

    # Object whose ``pad`` method performs the padding.
    tokenizer: "PreTrainedTokenizerBase"
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: "Union[bool, str, PaddingStrategy]" = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of multiple-choice features into a batch of tensors.

        Args:
            features: non-empty list of dicts as described on the class.

        Returns:
            dict of tensors: every padded key shaped
            (batch_size, num_choices, -1), plus "labels" as an int64 tensor.

        Raises:
            KeyError: if a feature lacks the label key found on the first one.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without pop(): the caller's dicts must not be modified.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat dict per (example, choice) pair, label excluded.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [18]:
accepted_keys = ["input_ids", "attention_mask", "label"]

def _keep_accepted(row):
    """Return a copy of `row` restricted to the keys listed in `accepted_keys`."""
    return {k: v for k, v in row.items() if k in accepted_keys}

# First five encoded training rows, reduced to the accepted keys.
features = [_keep_accepted(encoded_datasets["train"][i]) for i in range(5)]
features

[{'label': 3,
  'input_ids': [[0,
    12948,
    6,
    5,
    313,
    5789,
    81,
    5,
    1958,
    4631,
    5,
    2931,
    9,
    10,
    512,
    6,
    8,
    10,
    693,
    2498,
    2608,
    5418,
    14504,
    4,
    2,
    2,
    13040,
    2156,
    5,
    313,
    3639,
    19957,
    7,
    5,
    26689,
    8,
    2599,
    24,
    4,
    2],
   [0,
    12948,
    6,
    5,
    313,
    5789,
    81,
    5,
    1958,
    4631,
    5,
    2931,
    9,
    10,
    512,
    6,
    8,
    10,
    693,
    2498,
    2608,
    5418,
    14504,
    4,
    2,
    2,
    13040,
    2156,
    10,
    621,
    792,
    10,
    10485,
    5258,
    6,
    150,
    80,
    604,
    3117,
    5,
    471,
    9,
    5,
    621,
    2498,
    2608,
    5418,
    1958,
    25,
    5,
    52,
    1972,
    27095,
    4,
    2],
   [0,
    12948,
    6,
    5,
    313,
    5789,
    81,
    5,
    1958,
    4631,
    5,
    2931,
    9,
    10,
    512,
    6,
    8,
    10,
    

In [None]:
# Print each feature dict on its own line for inspection.
for feature in features:
    print(feature)

In [None]:
# NOTE: pop() removes "label" from every dict in `features` in place, so running
# this cell a second time without rebuilding `features` raises KeyError.
labels = [feature.pop("label") for feature in features]

In [None]:
# Display the labels that were removed from the feature dicts.
labels

In [22]:
# Step through the collator's flattening logic by hand, printing each stage.
num_choices = len(features[0]["input_ids"])
print(num_choices)

# Stage 1: one inner list per example, holding one dict per answer choice.
flattened_features = [
    [{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
    for feature in features
]
print('')
print(flattened_features)

# Stage 2: concatenate the per-example lists into one flat list of choices.
flattened_features = [choice for per_example in flattened_features for choice in per_example]
print('')
print(flattened_features)

4

[[{'input_ids': [0, 12948, 6, 5, 313, 5789, 81, 5, 1958, 4631, 5, 2931, 9, 10, 512, 6, 8, 10, 693, 2498, 2608, 5418, 14504, 4, 2, 2, 13040, 2156, 5, 313, 3639, 19957, 7, 5, 26689, 8, 2599, 24, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, {'input_ids': [0, 12948, 6, 5, 313, 5789, 81, 5, 1958, 4631, 5, 2931, 9, 10, 512, 6, 8, 10, 693, 2498, 2608, 5418, 14504, 4, 2, 2, 13040, 2156, 10, 621, 792, 10, 10485, 5258, 6, 150, 80, 604, 3117, 5, 471, 9, 5, 621, 2498, 2608, 5418, 1958, 25, 5, 52, 1972, 27095, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, {'input_ids': [0, 12948, 6, 5, 313, 5789, 81, 5, 1958, 4631, 5, 2931, 9, 10, 512, 6, 8, 10, 693, 2498, 2608, 5418, 14504, 4, 2, 2, 13040, 2156, 5, 313, 4650, 15, 10, 29224, 13738, 9540, 6, 11269, 16

In [24]:
# Pad every flattened choice to the fixed length params.max_length and return
# PyTorch tensors.
batch_2 = tokenizer.pad(
    flattened_features,
    max_length = params.max_length,
    padding='max_length',
    return_tensors="pt",
)

In [26]:
# Goal: input_ids are padded with 1 and attention masks are padded with 0;
# print the padded batch to check this.
print(batch_2)

{'input_ids': tensor([[    0, 12948,     6,  ...,     1,     1,     1],
        [    0, 12948,     6,  ...,     1,     1,     1],
        [    0, 12948,     6,  ...,     1,     1,     1],
        ...,
        [    0,   133,   313,  ...,     1,     1,     1],
        [    0,   133,   313,  ...,     1,     1,     1],
        [    0,   133,   313,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [19]:
# Build fresh feature dicts for the collator check instead of reusing `features`:
# an earlier exploration cell pops the "label" key from those dicts in place, and
# the collator needs the label to be present.
collator_features = [
    {k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys}
    for i in range(5)
]
batch = DataCollatorForMultipleChoice(tokenizer)(collator_features)

In [21]:
# Attention mask of the first example in the batch: one row per answer choice.
batch['attention_mask'][0]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Padded token ids of the first example in the batch: one row per answer choice.
batch['input_ids'][0]

In [None]:
# Decode the four padded choices of the first batch example back to text.
padded_choice_ids = batch["input_ids"][0]
[tokenizer.decode(padded_choice_ids[choice].tolist()) for choice in range(4)]

In [None]:
# Gold labels the collator stored under "labels".
# labels are 0 indexed, so this is the last label
batch["labels"]

In [None]:
import numpy as np

def compute_metrics(eval_predictions):
    """Compute accuracy for a (predictions, label_ids) pair.

    Args:
        eval_predictions: two-element iterable; the first item holds one row of
            scores per example, the second the gold label of each example.

    Returns:
        dict with a single key "accuracy": the fraction of rows whose
        highest-scoring column equals the gold label, as a Python float.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
# Wire the model, training arguments, encoded splits, collator and metric together.
multiple_choice_collator = DataCollatorForMultipleChoice(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=multiple_choice_collator,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the trainer built in the previous cell.
trainer.train()

In [None]:
# NOTE(review): this cell looks like a leftover from a different training setup and
# is unlikely to run as-is — confirm before executing:
#   * `dataset_path`, `train_dataloader` and `validation_dataloader` are not defined
#     in any of the cells visible in this notebook.
#   * `Trainer` was re-imported from `transformers` in an earlier cell, after
#     `from trainer import *`; the keyword arguments used below (device,
#     train_dataloader, epochs, notify, ...) presumably target the project's own
#     trainer class rather than the transformers one.
model.to(params.device)
print(f"Trained Dataset: {dataset_path}")
print(f"Device: {params.device}")

# Adam over all model parameters, learning rate taken from params.
optimizer = torch.optim.Adam(params=model.parameters(), lr=params.learning_rate) #roberta

trainer = Trainer(model=model,
                  device=params.device,
                  tokenizer=params.tokenizer,
                  train_dataloader=train_dataloader,
                  validation_dataloader=validation_dataloader,
                  epochs=params.epochs,
                  optimizer=optimizer,
                  val_loss_fn=params.val_loss_fn,
                  num_labels=params.num_labels,
                  notify=params.notify,
                  phone_number=params.phone_number,
                  save_dir=params.save_dir,
                  model_name=params.model_name, 
                  save_freq=params.save_freq)

Fit the model to our training data.

In [None]:
# NOTE(review): `fit` presumably belongs to the project's own trainer class; confirm
# that `trainer` is that object here and not the transformers Trainer.
trainer.fit()