# intermediate_XED_binary
This notebook takes our custom XED binary dataset and trains an intermediate model.

## Imports & Settings

First, change the working directory to the parent directory so that we can import our custom modules.

In [None]:
import os

# Resolve the parent directory only once per kernel session and always chdir
# to that absolute path, so re-running this cell does not climb one more
# directory level each time.
if "_NOTEBOOK_PROJECT_ROOT" not in globals():
    _NOTEBOOK_PROJECT_ROOT = os.path.abspath("..")
os.chdir(_NOTEBOOK_PROJECT_ROOT)
# os.getcwd( )

In [None]:
import params
from utils import *
from trainer import *

import numpy as np
import pandas as pd
from datasets import load_from_disk

from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# suppress model warning
# Import the transformers logging helper under an alias so it is not
# shadowed by the standard-library `logging` module imported just below.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# set logging level
import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

In [None]:
# Single seed shared by every RNG configured in this cell, so the general
# seeds and the DataLoader generator cannot drift apart.
SEED = 1

# set general seeds
set_seeds(SEED)

# set dataloader generator seed
# NOTE(review): no explicit `import torch` is visible in the import cell;
# `torch` presumably arrives via the star imports — confirm.
g = torch.Generator()
g.manual_seed(SEED)

# set params for this model
params.num_labels = 2
params.output_dir = "model_saves/intermediate_XED_binary_01"
params.dataset_path = "data/inter_XED/itesd_xed_binary_balanced.hf"

# Ensure we're on an ARM environment if necessary.
platform_check()

## Load Data

### Binary XED

In [None]:
# Load the dataset stored at params.dataset_path (assigned in the settings cell)
datasets = load_from_disk(params.dataset_path)
# Bare last expression: show the loaded object as this cell's output
datasets

In [None]:
# Inspecting and preparing the data is easier with pandas, so convert the
# 'train' and 'validation' splits to DataFrames in one pass.
train_df, validate_df = (
    datasets[split_name].to_pandas() for split_name in ('train', 'validation')
)

In [None]:
# view training dataset
print("train_df Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train_df.info()
print("\ntrain_df Value Counts")
print(train_df['label'].value_counts())

In [None]:
# view validation dataset
print("validate_df Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
validate_df.info()
print("\n validate_df Value Counts")
print(validate_df['label'].value_counts())

## Preprocess

In [None]:
# Store the pretrained "roberta-base" tokenizer on the shared params module.
# NOTE(review): `use_fast=True` is passed to the `RobertaTokenizer` class;
# confirm whether `RobertaTokenizerFast` was the intended class.
params.tokenizer = RobertaTokenizer.from_pretrained("roberta-base", use_fast=True)
# Apply preprocessing_dyna to the loaded datasets in batched mode.
# presumably preprocessing_dyna reads params.tokenizer, which is why the
# tokenizer is assigned first — verify against utils.
encoded_datasets = datasets.map(preprocessing_dyna, batched=True)

# Bare last expression: show the encoded object as this cell's output
encoded_datasets

In [None]:
# Build the model inputs for the 'train' and 'validation' splits, in that order.
train_features, validate_features = (
    construct_input(encoded_datasets[split_name])
    for split_name in ('train', 'validation')
)

## Data Split
We use the dataset's existing train (80%) and validation (20%) splits, and wrap each one in a `torch.utils.data.DataLoader` object.

In [None]:
# Prepare DataLoader
# Options shared by both loaders; only the feature set and its sampler differ.
loader_kwargs = dict(
    batch_size=params.batch_size,
    worker_init_fn=seed_worker,
    generator=g,
    collate_fn=collate,
)

# Training batches, drawn in random order
train_dataloader = DataLoader(
    train_features,
    sampler=RandomSampler(train_features),
    **loader_kwargs,
)

# Validation batches, also drawn through a RandomSampler
validation_dataloader = DataLoader(
    validate_features,
    sampler=RandomSampler(validate_features),
    **loader_kwargs,
)

In [None]:
# view an example from the dataloader
# Creates a fresh iterator over train_dataloader and displays its first batch
next(iter(train_dataloader))

## Train

Download `transformers.RobertaForSequenceClassification`, which is a RoBERTa model with a linear layer for sentence classification (or regression) on top of the pooled output:

In [None]:
model.classifier

In [None]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                         num_labels = params.num_labels,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Set model to device, initialize trainer

In [None]:
# Move the model to the configured device and report which device is in use
model.to(params.device)
print(f"Device: {params.device}")

# Adam over all model parameters, using the rates stored on params
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=params.learning_rate,
    weight_decay=params.weight_decay,
)  # roberta

# Gather every Trainer argument in one place, then build the trainer
trainer_kwargs = {
    "model": model,
    "device": params.device,
    "tokenizer": params.tokenizer,
    "train_dataloader": train_dataloader,
    "validation_dataloader": validation_dataloader,
    "epochs": params.epochs,
    "optimizer": optimizer,
    "val_loss_fn": params.val_loss_fn,
    "num_labels": params.num_labels,
    "output_dir": params.output_dir,
    "save_freq": params.save_freq,
    "checkpoint_freq": params.checkpoint_freq,
}
trainer = Trainer(**trainer_kwargs)

output_parameters()

Fit the model to our training data.

In [None]:
# Fit the model using the Trainer constructed in the previous cell
trainer.fit()

In [None]:
device_name = "mps"
# Look the backend module up by the *value* of device_name (e.g.
# torch.backends.mps). Writing `torch.backends.device_name` would search for
# an attribute literally called "device_name" and raise AttributeError.
device_available = getattr(torch.backends, device_name).is_available()
device_available

In [None]:
# NOTE(review): scratch sketch of state for the refactored Trainer. `self` is
# not defined at notebook top level, so this presumably belongs inside the
# class (e.g. its constructor) rather than being run as-is.
dataset_path = "data/inter_XED/itesd_xed_binary_balanced.hf"
# Must be exactly one of the identifiers compared in configure_model /
# get_dataloader: "multiple_choice" or "sequence_classification".
# (`"a" or "b"` always evaluates to "a", so it cannot express a choice.)
self.data_type = "sequence_classification"
self.datasets = load_from_disk(dataset_path)


# how to distinguish between sequence classification and mc?
# can i get that info from self.model?
# encoded_datasets = datasets.map(preprocessing_dyna, batched=True)


# train_features = construct_input(encoded_datasets['train'])
# validate_features = construct_input(encoded_datasets['validation'])

#TODO fix this so that it actually sets device before self.device is called?
self.device = torch.device(device_name)
self.optimizer = self.configure_optimizer()

# def get_device(self, device_name):
#     device = torch.device(device_name if torch.backends.mps.is_available() else 'cpu')
        
#     if device != device_name:
#         logging.info(f"{device_name} not available, falling back to {device}")
        
#     return device

def configure_optimizer(self):
    """Build an Adam optimizer over ``self.model``'s parameters.

    Reads ``self.learning_rate`` and ``self.weight_decay`` for the optimizer
    settings and returns the new ``torch.optim.Adam`` instance.
    """
    return torch.optim.Adam(
        params=self.model.parameters(),
        lr=self.learning_rate,
        weight_decay=self.weight_decay,
    )

def configure_model(self):
    """Load the pretrained 'roberta-base' model that matches ``self.data_type``.

    ``"multiple_choice"`` loads ``RobertaForMultipleChoice`` and
    ``"sequence_classification"`` loads ``RobertaForSequenceClassification``.
    The model is moved to ``self.device``, passed to torchinfo's ``summary``,
    and returned.

    Raises:
        ValueError: if ``self.data_type`` is neither supported identifier.
    """
    # Pick the model class; names are resolved only in the branch taken.
    if self.data_type == "multiple_choice":
        model_cls = RobertaForMultipleChoice
    elif self.data_type == "sequence_classification":
        model_cls = RobertaForSequenceClassification
    else:
        # Fail with a clear message instead of an unbound `model` further down.
        raise ValueError(
            f"Unsupported data_type {self.data_type!r}; expected "
            "'multiple_choice' or 'sequence_classification'"
        )

    model = model_cls.from_pretrained('roberta-base',
                                      num_labels = params.num_labels,
                                      output_attentions = False,
                                      output_hidden_states = False,
                                      )

    model.to(self.device)
    # NOTE(review): (1, 512) looks like a (batch, seq_len) input; confirm it
    # is also a valid summary input for the multiple-choice model.
    summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])
    return model


def train_dataloader(self) -> DataLoader:
    """Return the DataLoader for the "train" split of ``self.datasets``.

    NOTE(review): this shares its name with the ``train_dataloader`` variable
    assigned earlier in the notebook; running this ``def`` at top level
    rebinds that name.
    """
    split_name = "train"
    return self.get_dataloader(self.datasets, split_name)

def val_dataloader(self) -> DataLoader:
    """Return the DataLoader for the "validation" split of ``self.datasets``."""
    split_name = "validation"
    return self.get_dataloader(self.datasets, split_name)

def get_dataloader(self, datasets, split) -> DataLoader:
    """Preprocess one split of ``datasets`` and wrap it in a DataLoader.

    Args:
        datasets: mapping of split name to dataset; each dataset must provide
            ``map(fn, batched=True)``.
        split: name of the split to load (e.g. "train" or "validation").

    Returns:
        A DataLoader over the constructed features, sampled with
        ``RandomSampler`` and batched with ``params.batch_size``.

    Raises:
        ValueError: if ``self.data_type`` is neither "multiple_choice" nor
            "sequence_classification".
    """
    # Pick the preprocessing function; names are resolved only in the branch
    # taken.
    if self.data_type == "multiple_choice":
        preprocess_fn = mc_preprocessing
    elif self.data_type == "sequence_classification":
        preprocess_fn = preprocessing_dyna
    else:
        # Fail with a clear message instead of an unbound variable below.
        raise ValueError(
            f"Unsupported data_type {self.data_type!r}; expected "
            "'multiple_choice' or 'sequence_classification'"
        )

    # Preprocess only the requested split rather than every split in
    # `datasets`, since only one split is used per call.
    encoded_split = datasets[split].map(preprocess_fn, batched=True)
    features = construct_input(encoded_split)

    return DataLoader(
        features,
        sampler=RandomSampler(features),
        batch_size=params.batch_size,
        worker_init_fn=seed_worker,
        generator=g,
        collate_fn=collate,
    )

In [1]:
import os

# Resolve the parent directory only once per kernel session and always chdir
# to that absolute path, so re-running this cell does not climb one more
# directory level each time.
if "_NOTEBOOK_PROJECT_ROOT" not in globals():
    _NOTEBOOK_PROJECT_ROOT = os.path.abspath("..")
os.chdir(_NOTEBOOK_PROJECT_ROOT)
# os.getcwd( )

from trainer import *

# Collect the configuration for the XED binary run in one place, then hand
# it to the Trainer as keyword arguments.
trainer_config = {
    "dataset_path": "data/inter_XED/itesd_xed_binary_balanced.hf",
    "data_type": "sequence_classification",
    "device": "mps",
    "tokenizer": RobertaTokenizer.from_pretrained("roberta-base"),
    "epochs": 10,
    "num_labels": 2,
    "output_dir": "model_saves/intermediate_XED_binary_01",
    "save_freq": 1,
    "checkpoint_freq": 1,
}
trainer = Trainer(**trainer_config)

In [2]:
# Start training with the Trainer built in the previous cell (the saved
# output below shows this run ending in a KeyboardInterrupt during epoch 1)
trainer.fit()

Loading cached processed dataset at data/inter_XED/itesd_xed_binary_balanced.hf/train/cache-bfb267995ec8d8a6.arrow
Loading cached processed dataset at data/inter_XED/itesd_xed_binary_balanced.hf/validation/cache-9e2b6461106d45c0.arrow
Loading cached processed dataset at data/inter_XED/itesd_xed_binary_balanced.hf/test/cache-b4c35a7ecbe89f5a.arrow
Epoch 1:   2%|▏         | 8/506 [00:03<03:50,  2.16batch/s]


KeyboardInterrupt: 