# intermediate_hellaswag
This notebook loads our preprocessed HellaSwag dataset and fine-tunes an intermediate RoBERTa multiple-choice model on it.

## Imports & Settings

First, change the working directory to the parent directory so that we can import our custom modules and functions.

In [None]:
import os

# Move to the parent of the directory the kernel started in, so that the
# project-local modules imported in the following cell can be found.
# The starting directory is recorded only once per kernel session, which makes
# this cell idempotent: re-running it lands in the same directory again instead
# of climbing one level higher on every execution (which a bare
# os.chdir('..') would do, breaking every relative path used later).
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [None]:
# Project-local modules (importable because the working directory was moved
# to the parent directory in the previous cell).
import params
from utils import *
from trainer import *

import numpy as np
import pandas as pd
# Imported explicitly: later cells call torch.Generator() and torch.optim.Adam
# directly, which previously only worked if a star import happened to expose
# the name `torch`.
import torch
from datasets import load_from_disk

from transformers import RobertaTokenizer, RobertaForMultipleChoice
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# suppress model warning
# Aliased so that it does not share a name with the standard-library `logging`
# module imported just below (the two were previously bound to the same name,
# one silently replacing the other).
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# set logging level
import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

In [None]:
# Seed the global RNGs through the project helper.
set_seeds(1)

# Dedicated, seeded generator that is later passed to the DataLoaders.
g = torch.Generator().manual_seed(1)

# Settings specific to this model run.
params.num_labels = 4
params.output_dir = "model_saves/intermediate_hellaswag_01"

# Project helper; per the original note it ensures we are on an ARM
# environment if necessary.
platform_check()

## Load Data

### hellaswag

In [None]:
# Location of the saved dataset, relative to the current working directory.
hellaswag_path = "data/inter_HellaSwag/itesd_hellaswag_balanced.hf"

# Read the saved dataset back from disk.
hellaswag_datasets = load_from_disk(hellaswag_path)

In [None]:
# Display the loaded dataset object as the cell output.
hellaswag_datasets

In [None]:
def show_one(example):
    """Print one multiple-choice example in a human-readable form.

    Args:
        example: mapping with the keys 'ctx_a', 'ctx_b', 'ending0' .. 'ending3'
            and 'label'; 'label' must be an integer index in 0..3 because it
            is used to index the list of option letters.

    Returns:
        None. The context, the four options (each prefixed with 'ctx_b') and
        the ground-truth option letter are written to stdout.
    """
    option_letters = ['A', 'B', 'C', 'D']

    print(f"Context: {example['ctx_a']}")
    for position, letter in enumerate(option_letters):
        ending = example["ending" + str(position)]
        print(f"  {letter} - {example['ctx_b']} {ending}")

    truth = option_letters[example['label']]
    print(f"\nGround truth: option {truth}")

# Print the training example at index 50 as a quick sanity check.
show_one(hellaswag_datasets["train"][50])

## Preprocess

In [None]:
# Load the pretrained roberta-base tokenizer and store it on the shared params
# module so that other code can reach it as params.tokenizer.
# NOTE(review): RobertaTokenizer is the Python ("slow") tokenizer class and
# use_fast is normally an AutoTokenizer option, so use_fast=True may have no
# effect here — confirm whether RobertaTokenizerFast was intended.
params.tokenizer = RobertaTokenizer.from_pretrained("roberta-base", use_fast=True)

In [None]:
# Apply the project's hella_preprocessing helper to the dataset in batched mode.
encoded_datasets = hellaswag_datasets.map(hella_preprocessing, batched=True)

# Display the encoded dataset object as the cell output.
encoded_datasets

### Double-Check input_id lengths
We perform this check to ensure that a maximum length of 256 tokens is sufficient for this task.

In [None]:
train_ids = encoded_datasets["train"]['input_ids']

# Flatten the two nesting levels of train_ids and record the length of every
# inner sequence.
lengths = [
    len(inner_ids)
    for outer_ids in train_ids
    for inner_ids in outer_ids
]

# Total number of inner sequences that were measured.
print(len(lengths))

In [None]:
# Longest inner input_ids sequence found in the training split.
max(lengths)

### View input Structure

Each example is encoded as four sequences: `ctx_a` and `ctx_b` are repeated four times, each copy joined with one of the four candidate endings. Every sequence starts with the \<s> token (RoBERTa's BOS token, which also plays the role of the CLS token), and its segments are separated by the \</s> token (the end-of-sequence / separator token).

https://huggingface.co/docs/transformers/model_doc/roberta
https://stackoverflow.com/questions/61465223/roberta-tokenization-of-multiple-sequences

In [None]:
# Print the first training example in readable form, for comparison with its
# encoded version.
show_one(hellaswag_datasets["train"][0])

In [None]:
# Decode the four encoded sequences (indices 0..3) of the first training
# example back to text, one per line.
first_example_ids = encoded_datasets['train']["input_ids"][0]
for choice_index in range(4):
    print(params.tokenizer.decode(first_example_ids[choice_index]))

In [None]:
# Only these fields of each encoded example are kept.
accepted_keys = ["input_ids", "attention_mask", "label"]

train_number_samples = len(encoded_datasets['train'])
val_number_samples = len(encoded_datasets['validation'])


def _keep_accepted(row):
    """Return a new dict holding only the entries of `row` whose key is in accepted_keys."""
    return {key: value for key, value in row.items() if key in accepted_keys}


# One filtered dict per example, in dataset order.
train_features = [
    _keep_accepted(encoded_datasets["train"][index])
    for index in range(train_number_samples)
]
validate_features = [
    _keep_accepted(encoded_datasets["validation"][index])
    for index in range(val_number_samples)
]

In [None]:
# Keyword arguments shared by both loaders: batch size, the project's worker
# seeding hook, the seeded generator and the project's collate function.
shared_loader_kwargs = dict(
    batch_size=params.batch_size,
    worker_init_fn=seed_worker,
    generator=g,
    collate_fn=mc_collate,
)

# Training loader: examples are drawn in random order.
train_dataloader = DataLoader(
    train_features,
    sampler=RandomSampler(train_features),
    **shared_loader_kwargs,
)

# Validation loader: built the same way, also with a random sampler.
validation_dataloader = DataLoader(
    validate_features,
    sampler=RandomSampler(validate_features),
    **shared_loader_kwargs,
)

In [None]:
# view an example from the dataloader
next(iter(train_dataloader))

## Train

* Note: if you are continuing from a checkpoint, skip to the next section.

Load `transformers.RobertaForMultipleChoice`, which is a RoBERTa model with a multiple-choice classification head on top (a linear layer on top of the pooled output, followed by a softmax over the choices):

In [None]:
# Load the pretrained roberta-base weights into RobertaForMultipleChoice,
# without returning attentions or hidden states.
model = RobertaForMultipleChoice.from_pretrained(
    'roberta-base',
    num_labels=params.num_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Show a layer summary of the model for an integer input of size (1, 4, 256)
# — presumably (batch, choices, tokens); confirm against the preprocessing.
from torchinfo import summary
summary(model, input_size=(1, 4, 256), dtypes=['torch.IntTensor'])

Move the model to the target device and initialize the trainer.

In [None]:
# Move the model to the configured device before the optimizer is built from
# its parameters.
model.to(params.device)
print(f"Device: {params.device}")

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=params.learning_rate)

# Everything the project Trainer needs, gathered in one place.
trainer_kwargs = dict(
    model=model,
    device=params.device,
    tokenizer=params.tokenizer,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    epochs=params.epochs,
    optimizer=optimizer,
    val_loss_fn=params.val_loss_fn,
    num_labels=params.num_labels,
    output_dir=params.output_dir,
    save_freq=params.save_freq,
    checkpoint_freq=params.checkpoint_freq,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Start training with the trainer configured in the previous cell.
trainer.fit()

## Continue Training from Checkpoint

In [None]:
# Load the pretrained roberta-base weights into RobertaForMultipleChoice,
# without returning attentions or hidden states.
model = RobertaForMultipleChoice.from_pretrained(
    'roberta-base',
    num_labels=params.num_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the configured device before the optimizer is built from
# its parameters.
model.to(params.device)
print(f"Device: {params.device}")

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=params.learning_rate)

# Checkpoint file handed to the Trainer through its checkpoint_load argument.
checkpoint_load = "model_saves/intermediate_hellaswag_01/E08_A0.61_F0.61/checkpoint.pt"

# Same Trainer configuration as for a fresh run, plus the checkpoint path.
trainer_kwargs = dict(
    model=model,
    device=params.device,
    tokenizer=params.tokenizer,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    epochs=params.epochs,
    optimizer=optimizer,
    val_loss_fn=params.val_loss_fn,
    num_labels=params.num_labels,
    output_dir=params.output_dir,
    save_freq=params.save_freq,
    checkpoint_freq=params.checkpoint_freq,
    checkpoint_load=checkpoint_load,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run training with the trainer that was constructed with checkpoint_load set.
trainer.fit()