## Platform Check
Ensure we're on an ARM environment. 

In [1]:
import platform


def is_arm_machine(machine: str) -> bool:
    """Return True when ``machine`` names an ARM CPU architecture.

    ``machine`` is the string reported by ``platform.machine()`` (for example
    ``'arm64'`` or ``'aarch64'``). Matching on the architecture name rather
    than on the full platform string keeps the check valid across OS releases
    and on ARM hosts other than macOS 13.0.
    """
    return machine.lower().startswith(("arm", "aarch64"))


if is_arm_machine(platform.machine()):
    print(f"We're Armed: {platform.platform()}")
else:
    print(f"WARNING! NOT ARMED: {platform.platform()}")

'macOS-13.0-arm64-i386-64bit'

## Settings & Imports

First, change the working directory to the parent directory so that we can import our custom modules.

In [None]:
import os

# Remember the directory the kernel started in the first time this cell runs.
# Anchoring on it makes the cell idempotent: a plain os.chdir('..') would climb
# one more level every time the cell is re-executed.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Work from the parent of the start directory so the project modules imported
# in the next cell can be found.
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))
os.getcwd()

In [None]:
import params
from trainer import *

import random
import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm import trange


import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from transformers import set_seed
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# from transformers import BertTokenizer, BertForSequenceClassification

# Suppress transformers' model warnings. The library's logging helper is
# imported under an alias so it is not confused with the standard-library
# `logging` module configured right after it.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# Standard-library logging: root logger level set to INFO.
import logging
logging.basicConfig(level='INFO')

## Seeds

In [None]:
# Pin every random number generator used in this notebook (PyTorch, Python's
# `random`, NumPy and the transformers helper) to the same seed.
SEED = 1
for apply_seed in (torch.manual_seed, random.seed, np.random.seed, set_seed):
    apply_seed(SEED)

def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` from the current torch seed.

    Passed to the DataLoaders as ``worker_init_fn``; ``worker_id`` is accepted
    for that call signature but is not used.
    """
    # Fold torch's seed into the 32-bit range before handing it on.
    seed_32bit = torch.initial_seed() % (1 << 32)
    random.seed(seed_32bit)
    np.random.seed(seed_32bit)

# Dedicated torch random generator with a fixed seed; it is passed as
# `generator=g` to the DataLoaders built later in the notebook.
g = torch.Generator()
g.manual_seed(1)


## Load Data

### SARC

In [None]:
# Read the SARC training CSV and expose its `comment` column under the name
# `text`, which the rest of the notebook expects.
dataset_path = 'data/SARC/train-balanced-sarcasm.csv'
df = pd.read_csv(dataset_path).rename(columns={'comment': 'text'})
df

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Keep only the rows that actually have text, then re-check the frame summary.
has_text = df['text'].notna()
df = df[has_text]
df.info()

In [None]:
# Count how many rows carry each label value.
df['label'].value_counts()

In [None]:
# Number of rows to draw for each label value.
sample_amounts = {0:10000, 1:10000}

# Draw the requested number of rows from every label group and stack them into
# one frame with a fresh 0..N-1 index.
# The groups are iterated directly instead of going through
# `groupby(...).apply(...)`: applying a function to the grouping column is
# deprecated since pandas 2.2, and once that column is excluded from the
# applied frames the following index reset would throw the labels away.
# Groups are visited in the same (sorted) order, so the random draws are the
# same as before.
df = pd.concat(
    [
        group.sample(
            # look up the number of samples to take for this label
            n=sample_amounts[label_value],
            # sample with replacement only when the group is too small
            replace=len(group) < sample_amounts[label_value],
        )
        for label_value, group in df.groupby('label')
    ],
    ignore_index=True,
)
df['label'].value_counts()

In [None]:
# Peek at the last rows of the two columns used downstream.
df[["text", "label"]].tail()

### SemEval

In [None]:
# Read the SemEval training CSV and rename its columns to the `text` / `label`
# names used by the rest of the notebook.
# Note: this rebinds `dataset_path` and `df`, replacing the SARC data loaded above.
dataset_path = 'data/target_semEval2022_en/iSarcasmEval-main/train/train.en.prepped-oversampled.csv'
df = pd.read_csv(dataset_path).rename(columns={'tweet': 'text', 'sarcastic': 'label'})

df.head()

In [None]:
# Print the column dtypes and non-null counts of the SemEval frame.
df.info()

In [None]:
# Class balance before sub-sampling.
df['label'].value_counts()

In [None]:
# Number of rows to draw for each label value.
sample_amounts = {0: 300, 1:300}

# Draw the requested number of rows from every label group and stack them into
# one frame with a fresh 0..N-1 index (matching the SARC sampling cell, which
# also resets the index).
# The groups are iterated directly instead of going through
# `groupby(...).apply(...)`: applying a function to the grouping column is
# deprecated since pandas 2.2, and the apply-based result carried a MultiIndex
# whose first level was also called `label`. Groups are visited in the same
# (sorted) order, so the random draws are the same as before.
df = pd.concat(
    [
        group.sample(
            # look up the number of samples to take for this label
            n=sample_amounts[label_value],
            # sample with replacement only when the group is too small
            replace=len(group) < sample_amounts[label_value],
        )
        for label_value, group in df.groupby('label')
    ],
    ignore_index=True,
)

In [None]:
# Class balance after sub-sampling.
df['label'].value_counts()

### Target Text & Labels

In [None]:
# Pull the raw inputs and their targets out of the frame as arrays.
text = df['text'].values
labels = df['label'].values

## Preprocess

In [None]:
# TODO change max_length
def preprocessing(input_text, tokenizer):
    '''
    Encode a single text sample with ``tokenizer.encode_plus``.

    Special tokens are added and the sequence is padded or truncated to
    ``params.max_length`` tokens. The result is returned as PyTorch tensors in a
    <class transformers.tokenization_utils_base.BatchEncoding> with the fields:
      - input_ids: the token ids
      - attention_mask: 0/1 flags marking which tokens the model should attend to
      - token_type_ids: token type ids (presumably only present for tokenizers
        that produce them - verify for the configured tokenizer)
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=params.max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )


# Encode every sample with the configured tokenizer, then concatenate the
# per-sample tensors along dim 0 into one tensor per field.
encodings = [preprocessing(sample, params.tokenizer) for sample in text]

token_id = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
attention_masks = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)
labels = torch.tensor(labels)

We can observe the token IDs for a text sample and recognize the special tokens that mark its start and end, followed by padding up to the desired max_length (`[CLS]`, `[SEP]` and `[PAD]` for a BERT tokenizer; `<s>`, `</s>` and `<pad>` for a RoBERTa tokenizer):

In [None]:
# token_id[6]

## Data Split
We split the dataset into train (80%) and validation (20%) sets, and wrap each of them in a torch.utils.data.DataLoader object. With its intuitive syntax, DataLoader provides an iterable over the given dataset.

In [None]:
val_ratio = 0.2

# Row positions of the train and validation splits, stratified by label.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=val_ratio,
    shuffle=True,
    stratify=labels,
    random_state=1,
)

# Train and validation sets: (token ids, attention mask, label) per example.
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoaders.
# The seeded generator `g` is handed to the RandomSampler itself: a sampler
# that is constructed explicitly does not pick up the DataLoader's `generator`
# argument, so without this the shuffle order would be derived from the global
# torch RNG state instead of from `g`.
train_dataloader = DataLoader(
    train_set,
    sampler=RandomSampler(train_set, generator=g),
    batch_size=params.batch_size,
    worker_init_fn=seed_worker,
    generator=g,
)

# Validation batches are served in their stored order.
validation_dataloader = DataLoader(
    val_set,
    sampler=SequentialSampler(val_set),
    batch_size=params.batch_size,
    worker_init_fn=seed_worker,
    generator=g,
)

## Train
It is time for the fine-tuning task:

Select hyperparameters based on the recommendations from the BERT paper¹:
The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks:

- Batch size: 16, 32

- Learning rate (Adam): 5e-5, 3e-5, 2e-5

- Number of epochs: 2, 3, 4

Download transformers.RobertaForSequenceClassification, the RoBERTa counterpart of transformers.BertForSequenceClassification¹¹: a pretrained encoder with a classification (or regression) head on top for sentence-level prediction:

In [None]:
# BERT alternative, kept for reference:
# model = BertForSequenceClassification.from_pretrained(
#     'bert-base-uncased',
#     num_labels = 2,
#     output_attentions = False,
#     output_hidden_states = False,
# )

# Load the pretrained RobertaForSequenceClassification model with
# `params.num_labels` output classes; attention maps and hidden states are
# switched off in the model outputs.
roberta_options = dict(
    num_labels=params.num_labels,
    output_attentions=False,
    output_hidden_states=False,
)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', **roberta_options)

# Layer-by-layer summary for an integer input of shape (1, 512).
from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Note: it is preferable to run this notebook on a hardware accelerator (GPU). The model is moved to `params.device` in the cell below, so to run on CPU set that device to `cpu` instead of editing the code.

Perform the training procedure:

In [None]:
# Move the model to the configured device, then build the optimizer over its
# parameters.
model.to(params.device)
print(f"Trained Dataset: {dataset_path}")
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-05)  # roberta

# Apart from the model, the dataloaders and the optimizer, every setting is
# taken from the `params` module.
trainer_settings = dict(
    model=model,
    device=params.device,
    tokenizer=params.tokenizer,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    epochs=params.epochs,
    optimizer=optimizer,
    val_loss_fn=params.val_loss_fn,
    notify=params.notify,
    phone_number=params.phone_number,
    save_dir=params.save_dir,
    model_name=params.model_name,
    save_freq=params.save_freq,
)
trainer = Trainer(**trainer_settings)

In [None]:
# Start the run on the Trainer configured in the previous cell (what fit()
# does is defined in the project's `trainer` module).
trainer.fit()

In [None]:
# NOTE(review): this cell is a hand-written training loop over the same `model`
# and dataloaders that are handed to the Trainer earlier in the notebook.
# Presumably only one of the two training paths is meant to be run - confirm.

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

print(f"Device: {device}")
model.to(device)

print(f"Trained Dataset: {dataset_path}")

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
# optimizer = torch.optim.AdamW(model.parameters(), 
#                               lr = 5e-5,
#                             #   lr = .01,
#                               eps = 1e-08
#                               )

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-05) #roberta

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 10
notify = False

# loss function for validation loop
val_loss_fn = nn.CrossEntropyLoss()

# Checkpoint settings. They do not change between epochs, so they are set once
# here instead of being reassigned on every pass through the loop.
save_dir = "model_saves/"
model_name = "bert_sarc_long_test"
save_freq = 1

# def compute_loss(logits, inputs, return_outputs=False):
#     logits =logits.to('mps')
#     labels = inputs.to('mps')
#     # compute custom loss (suppose one has 3 labels with different weights)
#     loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.6667, 1.9994])).to('mps')
#     loss = loss_fct(logits.view(-1, 2).to('mps'), labels.view(-1).to('mps'))
#     return loss

for epoch in range(1, epochs+1):
    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # tqdm for progress bars
    with tqdm(train_dataloader, unit="batch") as tepoch:
        for step, batch in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")

            # Each batch is (input ids, attention mask, labels), as packed
            # into the TensorDataset.
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            # Forward pass; passing `labels` makes the model return the loss.
            train_output = model(b_input_ids,
                                 token_type_ids = None,
                                 attention_mask = b_input_mask,
                                 labels = b_labels)

            # training_loss = compute_loss(train_output.logits, b_labels) # new loss

            # Backward pass
            train_output.loss.backward()
            # training_loss.backward() # new loss

            # # gradient clipping set to 5.0 in line with E. Savini Paper
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            # Update tracking variables
            tr_loss += train_output.loss.item()
            # tr_loss += training_loss.item() # new loss
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

    # --- VALIDATE -------------------------------------
    # Runs once per epoch, after the progress bar has been closed so the
    # metric lines are not interleaved with it.
    val_loss, val_acc, val_f1, val_recall, val_precision = validate(model, validation_dataloader, device, val_loss_fn)
    print('\t - Train loss: {:.6f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Loss: {:.6f}'.format(val_loss))
    print('\t - Validation Accuracy: {:.6f}'.format(val_acc))
    print('\t - Validation F1: {:.6f}'.format(val_f1))
    print('\t - Validation Recall: {:.6f}'.format(val_recall))
    print('\t - Validation Precision: {:.6f}'.format(val_precision))

    # ----------- SAVE -----------
    # Use the tokenizer from `params` (the one used for preprocessing): no
    # variable named `tokenizer` is assigned before this cell in the notebook.
    save_model(epoch, model, params.tokenizer, save_dir, model_name, save_freq, val_acc, val_f1)

    # ----------- NOTIFY -----------
    if notify:
        # The recipient comes from `params` rather than being hard-coded in
        # the notebook, matching the Trainer configuration.
        message = f"{model_name} epoch {epoch}:\nAccuracy: {round(val_acc, 2)} \nF1: {round(val_f1, 2)}"
        send_message(params.phone_number, message)

## Load & Predict

### Full Test

In [None]:
from transformers import AutoModelForSequenceClassification, TextClassificationPipeline

# Directory of the saved checkpoint to evaluate. Model and tokenizer are both
# loaded from it with `local_files_only=True`.
PATH = 'model_saves/bert_sarc_long_test/E04_A0.92_F0.91/'
local_only = {'local_files_only': True}
model = AutoModelForSequenceClassification.from_pretrained(PATH, **local_only)
tokenizer = RobertaTokenizer.from_pretrained(PATH, **local_only)

# define pipeline (configured with top_k=2)
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=2)

In [None]:
# Read the SemEval task A English test CSV and rename its columns to the
# `text` / `label` names used by the evaluation cells below.
test_csv = 'data/target_semEval2022_en/iSarcasmEval-main/test/task_A_En_test.csv'
df = pd.read_csv(test_csv).rename(columns={'tweet': 'text', 'sarcastic': 'label'})

df.head()

In [None]:
test_input = df['text'].to_list()

# Run the pipeline on one text at a time and keep the first element of each
# result.
test_output = []

with tqdm(test_input, unit="test") as progress:
    for index, sample_text in enumerate(progress):
        progress.set_description(f"Test {index}")
        result = pipe(sample_text)
        test_output.append(result[0])

In [None]:
# parse predictions to new list
predictions = []

for i in test_output:
    predictions.append(i[0]['label'])
    
print(len(predictions))

In [None]:
# Store the predictions next to the gold labels: strip the "LABEL_" prefix from
# each predicted label string and convert the remainder to an integer.
df['preds'] = (
    pd.Series(predictions, index=df.index)
    .str.replace("LABEL_", "")
    .astype(int)
)
df.tail()

In [None]:
# Print the column dtypes and non-null counts, now including `preds`.
df.info()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Score the predictions against the gold labels.
# NOTE(review): the original comment here said "epoch 3", while the checkpoint
# path loaded above points at an "E04" directory - confirm which one was evaluated.
y_true, y_pred = df['label'], df['preds']
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(acc)
print(f1)