## Platform Check
Ensure we're on an ARM environment. 

In [1]:
import platform


def is_arm_machine(machine):
    """Return True when `machine` names an ARM CPU architecture.

    Parameters
    ----------
    machine : str
        Architecture string as reported by ``platform.machine()``
        (e.g. ``"arm64"``, ``"aarch64"``, ``"x86_64"``).

    Returns
    -------
    bool
        True for strings starting with ``arm`` or ``aarch64`` (case-insensitive).
    """
    return machine.lower().startswith(("arm", "aarch64"))


# Check the CPU architecture rather than one exact platform string, so the
# check keeps working after an OS update or on another ARM machine.
if is_arm_machine(platform.machine()):
    print(f"We're Armed: {platform.platform()}")
else:
    print(f"WARNING! NOT ARMED: {platform.platform()}")

We're Armed: macOS-13.0-arm64-i386-64bit


## Imports & Settings

In [2]:
import params
from utils import set_seeds
from trainer import *

import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm import trange

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from transformers import RobertaTokenizer, RobertaForSequenceClassification

# suppress model warning
# The transformers logging helper is imported under an alias so that it is not
# confused with (and then overwritten by) the stdlib `logging` module below.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# set logging level
import logging
logging.basicConfig(level='INFO')

# set general seeds
# NOTE(review): set_seeds comes from the project's utils module; which RNGs it
# seeds is not visible here — confirm in utils.py.
set_seeds(1)

# set dataloader generator seed
# `g` is a module-level name that later cells hand to the DataLoaders.
g = torch.Generator()
g.manual_seed(1)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x28c8a9970>

## Load Data

### SARC

In [None]:
dataset_path = 'data/SARC/train-balanced-sarcasm.csv'

# Read the SARC CSV and expose its `comment` column under the name `text`,
# the column name the rest of the notebook works with.
df = pd.read_csv(dataset_path).rename(columns={'comment': 'text'})
df

In [None]:
# Summary of the loaded frame: columns, dtypes and non-null counts
df.info()

In [None]:
# Discard rows with a missing `text` value, then print the summary again
# to confirm the remaining row count.
df = df.dropna(subset=['text'])
df.info()

In [None]:
# Number of rows per label value
df['label'].value_counts()

In [None]:
# Number of rows to keep for each label value
sample_amounts = {0:10000, 1:10000}

# Draw the requested number of rows from every label group. A group that is
# smaller than its quota is sampled with replacement so the quota is still met.
per_label = df.groupby('label')
sampled = per_label.apply(
    lambda grp: grp.sample(
        n=sample_amounts[grp.name],
        replace=sample_amounts[grp.name] > len(grp),
    )
)

# Replace the index produced by groupby/apply with a plain 0..n-1 range
df = sampled.reset_index(drop=True)
df['label'].value_counts()

In [None]:
# Peek at the last rows of the two columns used downstream
df[["text", "label"]].tail()

### SemEval

In [None]:
dataset_path = 'data/target_semEval2022_en/iSarcasmEval-main/train/train.en.prepped-oversampled.csv'

# Read the SemEval training CSV and map its column names onto the
# `text` / `label` names used by the rest of the notebook.
# Note: this rebinds `df` and `dataset_path`, replacing whatever was loaded before.
df = pd.read_csv(dataset_path).rename(
    columns={'tweet': 'text', 'sarcastic': 'label'}
)

df.head()

In [None]:
# Summary of the loaded frame: columns, dtypes and non-null counts
df.info()

In [None]:
# Number of rows per label value before sampling
df['label'].value_counts()

In [None]:
# Number of rows to keep for each label value
sample_amounts = {0: 300, 1:300}

df = (
    df.groupby('label').apply(lambda grp: grp.sample(
        # lookup number of samples to take
        n=sample_amounts[grp.name],
        # enable replacement if len is less than number of samples expected
        replace=len(grp) < sample_amounts[grp.name]
    ))
    # groupby/apply returns a (label, original index) MultiIndex; without
    # dropping it, `label` is both an index level and a column, and running
    # this cell a second time fails with an ambiguity ValueError.
    .reset_index(drop=True)
)

In [None]:
# Number of rows per label value after sampling
df['label'].value_counts()

### Target Text & Labels

In [None]:
# Pull the model inputs and targets out of the frame as arrays
text = df['text'].values
labels = df['label'].values

## Preprocess

In [None]:
# Tokenize every sample once. `preprocessing` is a project helper; each result
# is indexed below by 'input_ids' and 'attention_mask'.
# NOTE(review): assumes each of those entries is a tensor with a leading
# batch dimension of 1 — confirm in the helper.
encodings = [preprocessing(sample, params.tokenizer) for sample in text]

# Concatenate the per-sample tensors along dim 0 and convert labels to a tensor
token_id = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
labels = torch.tensor(labels)

## Data Split
We split the dataset into train (80%) and validation (20%) sets and wrap each one in a `torch.utils.data.DataLoader` object. With its intuitive syntax, `DataLoader` provides an iterable over the given dataset.

In [None]:
# Fraction of the samples held out for validation
val_ratio = 0.2

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state=1)

# Train and validation sets: (token ids, attention mask, label) per sample
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader
# The seeded generator `g` is passed to the RandomSampler itself: when a
# sampler is supplied explicitly, DataLoader does not forward its own
# `generator` to it, so the shuffle order would otherwise come from the
# global torch RNG state instead of `g`.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set, generator=g),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
        )

# Validation batches are served in dataset order
validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
        )

## Train
It is time for the fine-tuning task:

Select hyperparameters based on the recommendations from the BERT paper (Devlin et al., 2019):

> The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks:

- Batch size: 16, 32

- Learning rate (Adam): 5e-5, 3e-5, 2e-5

- Number of epochs: 2, 3, 4

Download `transformers.RobertaForSequenceClassification` (initialized from the `roberta-base` checkpoint), which is a RoBERTa model with a sequence classification (or regression) head — a linear layer on top of the pooled output:

In [None]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                         num_labels = params.num_labels,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Perform the training procedure:

In [None]:
# Move the model to the configured device and report the run setup
model.to(params.device)
print(f"Trained Dataset: {dataset_path}")
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(model.parameters(), lr=1e-05) #roberta

# Everything the project's Trainer needs; most values come from the params module
trainer_kwargs = dict(
    model=model,
    device=params.device,
    tokenizer=params.tokenizer,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    epochs=params.epochs,
    optimizer=optimizer,
    val_loss_fn=params.val_loss_fn,
    notify=params.notify,
    phone_number=params.phone_number,
    save_dir=params.save_dir,
    model_name=params.model_name,
    save_freq=params.save_freq,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Start the run on the Trainer built in the previous cell.
# NOTE(review): presumably trains and validates for params.epochs and writes
# checkpoints under params.save_dir — confirm in the Trainer implementation.
trainer.fit()

## Load & Predict

### Full Test

In [None]:
from transformers import AutoModelForSequenceClassification, TextClassificationPipeline

# Directory of the saved checkpoint to evaluate
PATH = 'model_saves/bert_sarc_long_test/E04_A0.92_F0.91/'

# Load tokenizer and model from local files only (no hub lookup).
# Note: this rebinds `model`, replacing the instance created earlier in the notebook.
tokenizer = RobertaTokenizer.from_pretrained(PATH, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(PATH, local_files_only=True)

# define pipeline; top_k=2 asks for the two best-scoring labels per input
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=2)

In [None]:
# Read the SemEval task A test CSV and map its column names onto the
# `text` / `label` names used by the rest of the notebook.
df = pd.read_csv(
    'data/target_semEval2022_en/iSarcasmEval-main/test/task_A_En_test.csv'
).rename(columns={'tweet': 'text', 'sarcastic': 'label'})

df.head()

In [None]:
test_input = df['text'].to_list()
test_output = []

# Classify the test texts one at a time, showing progress with tqdm
with tqdm(test_input, unit="test") as progress:
    for idx, sample_text in enumerate(progress):
        progress.set_description(f"Test {idx}")
        # only the first element of the pipeline's return value is kept
        result = pipe(sample_text)
        test_output.append(result[0])

In [None]:
# parse predictions to new list
predictions = []

for i in test_output:
    predictions.append(i[0]['label'])
    
print(len(predictions))

In [None]:
# Turn label strings such as "LABEL_1" into integers and store them
# alongside the ground truth in a `preds` column.
label_strings = pd.Series(predictions, index=df.index)
df['preds'] = label_strings.str.replace("LABEL_","").astype(int)
df.tail()

In [None]:
# Summary of the test frame, now including the `preds` column
df.info()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# NOTE(review): this cell was originally annotated "epoch 3", while the
# checkpoint directory loaded above is named E04 — confirm which checkpoint
# these scores belong to.
y_true = df['label']
y_pred = df['preds']

# Accuracy and F1 of the predictions against the ground-truth labels
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(acc)
print(f1)