# intermediate_hellaswag
This notebook takes our hellaswag dataset and trains an intermediate model.

## Platform Check
Ensure we're on an ARM environment. 

In [1]:
import platform

# Check the CPU architecture instead of the full platform string: the full
# string embeds the OS version (e.g. "macOS-13.0-..."), so an exact match
# starts failing after any OS update even though the machine is still ARM.
if platform.machine() in ('arm64', 'aarch64'):
    print(f"We're Armed: {platform.platform()}")
else:
    print(f"WARNING! NOT ARMED: {platform.platform()}")

We're Armed: macOS-13.0-arm64-i386-64bit


## Imports & Settings

First, change the working directory to the parent folder so that our custom modules (`params`, `utils`, `trainer`) can be imported.

In [2]:
import os

# Move up to the parent directory only once per kernel session. Without the
# guard, re-running this cell would climb one directory higher each time and
# break the relative imports and data paths used below.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
    os.chdir('..')

# Last expression: display the directory the rest of the notebook runs from.
os.getcwd()

'/Users/jarradjinx/Library/Mobile Documents/com~apple~CloudDocs/EDU_leeds/LD_research/LD_ITESD'

In [3]:
import params
from utils import *
from trainer import *

import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm import trange

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from datasets import load_from_disk, load_metric
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# from transformers import BertTokenizer, BertForSequenceClassification

# Suppress model warnings: only errors from the transformers library are shown.
# Imported under an alias so it is not confused with (or shadowed by) the
# standard-library `logging` module imported just below.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# set logging level for the standard-library logger
import logging
logging.basicConfig(level='INFO')

# set general seeds (project helper from the star imports above;
# presumably seeds python/numpy/torch -- confirm in utils)
set_seeds(1)

# set dataloader generator seed; this generator is handed to the DataLoaders below
g = torch.Generator()
g.manual_seed(1)

<torch._C.Generator at 0x2987760f0>

## Load Data

### hellaswag

In [4]:
# "../data/inter_HellaSwag/hellaswag.hf"
# Load the saved HellaSwag dataset from disk; the path is relative to the
# working directory set at the top of the notebook.
hellaswag_path = "data/inter_HellaSwag/hellaswag.hf"
hellaswag_datasets = load_from_disk(hellaswag_path)

In [5]:
# Display the dataset summary: available splits, their features and row counts.
hellaswag_datasets

DatasetDict({
    train: Dataset({
        features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label', 'ending0', 'ending1', 'ending2', 'ending3'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label', 'ending0', 'ending1', 'ending2', 'ending3'],
        num_rows: 10042
    })
})

In [6]:
def show_one(example):
    """Pretty-print one HellaSwag example with its four candidate endings.

    Args:
        example: mapping with the keys 'ctx_a', 'ctx_b', 'ending0'..'ending3'
            and 'label'. 'label' is the index (0-3) of the correct ending and
            may be an int or a numeric string.

    Prints the context, the four options labelled A-D (each prefixed with
    'ctx_b'), and the letter of the ground-truth option. Returns None.
    """
    option_letters = ['A', 'B', 'C', 'D']
    ending_keys = ['ending0', 'ending1', 'ending2', 'ending3']

    print(f"Context: {example['ctx_a']}")
    for letter, ending_key in zip(option_letters, ending_keys):
        print(f"  {letter} - {example['ctx_b']} {example[ending_key]}")

    # int() lets labels stored as strings ("2") index the letter list as well.
    answer = option_letters[int(example['label'])]
    print(f"\nGround truth: option {answer}")

# Sanity check: print a single training example (row 250) with its options and answer.
show_one(hellaswag_datasets["train"][250])

Context: A group of athletes row on canoes during a race in between buoys on a waterway.
  A - the men pass over a wooden structure in the river.
  B - the men paddle while crashing through endless waves in the river.
  C - the men cross the final numbered buoys and glide while slowing down after the race.
  D - the men go over large cliffs into a lagoon.

Ground truth: option C


## Preprocess

In [7]:
# Number of training rows to use. Set to None to use the full training split
# (a slice of [:None] selects every row).
N_SAMPLES = 50

# Slicing the split returns its columns as plain Python containers keyed by
# column name. The original author observed that passing a slice (even the
# full `[:]`) makes preprocessing MUCH faster than passing the split itself.
examples = hellaswag_datasets["train"][:N_SAMPLES]

# Read the labels from the same slice so they always line up with `examples`
# and the full label column never has to be materialised separately.
labels = examples['label']


In [8]:
# Columns holding the four candidate endings of each example.
ending_names = ["ending0", "ending1", "ending2", "ending3"]

# Tokenise the sampled examples with the project helper (defined outside this
# notebook). The cells below read 'input_ids' and 'attention_mask' from its result.
encoding_dict = mc_preprocessing(examples, params.tokenizer, ending_names)

# as_tensor converts the label list to a tensor but, unlike torch.tensor, simply
# returns its input when `labels` is already a tensor, so re-running this cell
# neither copies nor triggers the copy-construct warning.
labels = torch.as_tensor(labels)

In [9]:
# Pull out the two encodings used to build the datasets; both are stacked into
# single tensors a few cells further down.
token_id = encoding_dict['input_ids']
attention_masks = encoding_dict['attention_mask']

In [None]:
# Inspect the token ids before stacking (still the raw encoding_dict['input_ids'] value).
token_id

In [None]:
# Inspect the first entry of the un-stacked token ids.
token_id[0]

In [10]:
# Stack the per-example encodings along a new leading dimension.
# Reading from encoding_dict (rather than from token_id / attention_masks
# themselves) keeps this cell idempotent: re-running it no longer feeds an
# already-stacked tensor back into torch.stack.
token_id = torch.stack(encoding_dict['input_ids'], 0)
attention_masks = torch.stack(encoding_dict['attention_mask'], 0)

In [None]:
# Inspect the token ids again: after torch.stack this is a single tensor.
token_id

In [None]:
# First example of the stacked tensor (presumably one row per candidate ending -- confirm).
token_id[0]

In [11]:
# Fraction of the sampled examples held out for validation.
val_ratio = 0.2

# Indices of the train and validation splits, stratified by label so both
# splits keep the same class balance. random_state fixes the split.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state=1)

# Train and validation sets: each item is (token ids, attention mask, label).
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader
# The seeded generator `g` is passed to the RandomSampler itself: a sampler
# that is constructed explicitly does not pick up the DataLoader's `generator`
# argument, so without this the shuffle order would not be driven by `g`.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set, generator=g),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
        )

# Validation batches are served in their original order.
validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
        )

In [16]:
# Check the container type of one training batch (the recorded output is `list`).
type(next(iter(train_dataloader)))

list

In [19]:
# Pull one training batch to eyeball it; fields follow the TensorDataset order:
# token ids, attention masks, labels.
# NOTE(review): the train loader uses a RandomSampler, so each call draws a
# freshly shuffled batch and consumes random state -- avoid before a run that
# must be reproducible.
next(iter(train_dataloader))

[tensor([[[   0,  170,  192,  ...,    1,    1,    1],
          [   0,  170,  192,  ...,    1,    1,    1],
          [   0,  170,  192,  ...,    1,    1,    1],
          [   0,  170,  192,  ...,    1,    1,    1]],
 
         [[   0, 4763, 1413,  ...,    1,    1,    1],
          [   0, 4763, 1413,  ...,    1,    1,    1],
          [   0, 4763, 1413,  ...,    1,    1,    1],
          [   0, 4763, 1413,  ...,    1,    1,    1]],
 
         [[   0,  133,  693,  ...,    1,    1,    1],
          [   0,  133,  693,  ...,    1,    1,    1],
          [   0,  133,  693,  ...,    1,    1,    1],
          [   0,  133,  693,  ...,    1,    1,    1]],
 
         ...,
 
         [[   0,  250,  410,  ...,    1,    1,    1],
          [   0,  250,  410,  ...,    1,    1,    1],
          [   0,  250,  410,  ...,    1,    1,    1],
          [   0,  250,  410,  ...,    1,    1,    1]],
 
         [[   0, 1620,   37,  ...,    1,    1,    1],
          [   0, 1620,   37,  ...,    1,    1,    1],


## Train

Load `transformers.RobertaForMultipleChoice`, a RoBERTa model with a multiple-choice classification head (a linear layer on top of the pooled output), initialised from the pretrained `roberta-base` checkpoint:

In [13]:
# from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaForMultipleChoice


# Options forwarded to from_pretrained: label count comes from params, and the
# extra attention / hidden-state outputs are switched off.
pretrained_kwargs = dict(
    num_labels=params.num_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Load the RobertaForMultipleChoice model from the 'roberta-base' checkpoint
model = RobertaForMultipleChoice.from_pretrained('roberta-base', **pretrained_kwargs)

from torchinfo import summary

# Layer-by-layer summary for one integer-typed input of shape (1, 4, 256)
# (presumably batch x choices x sequence length -- confirm against preprocessing).
summary_input_size = (1, 4, 256)
summary(model, input_size=summary_input_size, dtypes=['torch.IntTensor'])

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForMultipleChoice                                     [1, 4]                    --
├─RobertaModel: 1-1                                          [4, 768]                  --
│    └─RobertaEmbeddings: 2-1                                [4, 256, 768]             --
│    │    └─Embedding: 3-1                                   [4, 256, 768]             38,603,520
│    │    └─Embedding: 3-2                                   [4, 256, 768]             768
│    │    └─Embedding: 3-3                                   [4, 256, 768]             394,752
│    │    └─LayerNorm: 3-4                                   [4, 256, 768]             1,536
│    │    └─Dropout: 3-5                                     [4, 256, 768]             --
│    └─RobertaEncoder: 2-2                                   [4, 256, 768]             --
│    │    └─ModuleList: 3-6                                  --               

Set model to device, initialize trainer

In [14]:
# Place the model on the configured device and report which one is in use.
model.to(params.device)
# print(f"Trained Dataset: {dataset_path}")
print(f"Device: {params.device}")

# Adam over every model parameter, with the learning rate taken from params.
optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)  # roberta

# Collect the Trainer arguments first so the configuration reads as one table:
# model/device/data, then schedule and loss, then notification and checkpointing.
trainer_kwargs = dict(
    model=model,
    device=params.device,
    tokenizer=params.tokenizer,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    epochs=params.epochs,
    optimizer=optimizer,
    val_loss_fn=params.val_loss_fn,
    num_labels=params.num_labels,
    notify=params.notify,
    phone_number=params.phone_number,
    save_dir=params.save_dir,
    model_name=params.model_name,
    save_freq=params.save_freq,
)
trainer = Trainer(**trainer_kwargs)

Device: mps


In [15]:
# Start training with the Trainer configured in the previous cell.
# The recorded output below ends in a KeyboardInterrupt, i.e. this run was stopped manually.
trainer.fit()

  incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
Epoch 1:  33%|███▎      | 1/3 [00:32<01:04, 32.07s/batch]


KeyboardInterrupt: 