# control_iSarcasm
This notebook trains our control model--a model that is only fine-tuned on our target task, iSarcasm.

## Platform Check
Ensure we're on an ARM environment. 

In [1]:
import platform

if "arm" in platform.platform():
    print(f"We're Armed: {platform.platform()}")
else:
    print(f"WARNING! NOT ARMED: {platform.platform()}")

We're Armed: macOS-13.0-arm64-i386-64bit


## Imports & Settings

First, update working directory to parent so that we may use our custom functions

In [1]:
import os
os.chdir('..')
# os.getcwd( )

In [3]:
import params
from utils import *
from trainer import *

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# suppress model warning
from transformers import logging
logging.set_verbosity_error()

# set logging level
import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x294a4a730>

In [None]:
# set general seeds
set_seeds(1)

# set dataloader generator seed
g = torch.Generator()
g.manual_seed(1)

# set params for this model
params.num_labels = 2
params.output_dir = "model_saves/control_iSarcasm_03"

# Ensure we're on an ARM environment if necessary.
platform_check()

## Load Data

### iSarcasm

In [4]:
dataset_path = 'data/target_semEval2022_en/iSarcasmEval-main/train/train.en.prepped-oversampled.csv'
df = pd.read_csv(dataset_path)
df = df.rename(columns={'tweet': 'text'})
df = df.rename(columns={'sarcastic': 'label'})

df.head()

Unnamed: 0,text,label
0,@ThePartridgePod Defrost the freezer with the ...,1
1,really honoured to have shared a coach today w...,1
2,I can't believe today is the last day we can b...,1
3,"thank you kind person that stole my ATM card, ...",1
4,I love leaving the doctors office in tears. I ...,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5200 non-null   object
 1   label   5200 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 81.4+ KB


In [6]:
df['label'].value_counts()

1    2600
0    2600
Name: label, dtype: int64

#### Sampling
The Sampling function below allows us to adjust the size of our dataset for quick testing.

In [None]:
sample_amounts = {0: 300, 1:300}

df = (
    df.groupby('label').apply(lambda g: g.sample(
        # lookup number of samples to take
        n=sample_amounts[g.name],
        # enable replacement if len is less than number of samples expected
        replace=len(g) < sample_amounts[g.name]  
    ))
)

In [None]:
df['label'].value_counts()

### Target Text & Labels

In [7]:
df['label'].value_counts()

1    2600
0    2600
Name: label, dtype: int64

In [8]:
text = df.text.values
labels = df.label.values

## Preprocess

In [9]:
token_id = []
attention_masks = []

for sample in text:
  encoding_dict = preprocessing(sample, params.tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

## Data Split
We split the dataset into train (80%) and validation (20%) sets, and wrap them around a torch.utils.data.DataLoader object.

In [10]:
val_ratio = 0.2

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state=1)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = RandomSampler(val_set),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
        )

## Train

Download transformers.RobertaForSequenceClassificatio, which is a RoBERTa model with a linear layer for sentence classification (or regression) on top of the pooled output:

In [11]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                         num_labels = params.num_labels,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForSequenceClassification                             [1, 2]                    --
├─RobertaModel: 1-1                                          [1, 512, 768]             --
│    └─RobertaEmbeddings: 2-1                                [1, 512, 768]             --
│    │    └─Embedding: 3-1                                   [1, 512, 768]             38,603,520
│    │    └─Embedding: 3-2                                   [1, 512, 768]             768
│    │    └─Embedding: 3-3                                   [1, 512, 768]             394,752
│    │    └─LayerNorm: 3-4                                   [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                     [1, 512, 768]             --
│    └─RobertaEncoder: 2-2                                   [1, 512, 768]             --
│    │    └─ModuleList: 3-6                                  --               

Set model to device, initialize trainer

In [12]:
model.to(params.device)
print(f"Trained Dataset: {dataset_path}")
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(params=model.parameters(), lr=params.learning_rate) #roberta

trainer = Trainer(model=model,
                  device=params.device,
                  tokenizer=params.tokenizer,
                  train_dataloader=train_dataloader,
                  validation_dataloader=validation_dataloader,
                  epochs=params.epochs,
                  optimizer=optimizer,
                  val_loss_fn=params.val_loss_fn,
                  num_labels=params.num_labels,
                  output_dir=params.output_dir,
                  save_freq=params.save_freq,
                  checkpoint_freq=params.checkpoint_freq)

Trained Dataset: data/target_semEval2022_en/iSarcasmEval-main/train/train.en.prepped-oversampled.csv
Device: mps


Fit the model to our training data.

In [13]:
trainer.fit()

  incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
Epoch 1: 100%|██████████| 260/260 [05:46<00:00,  1.33s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:25<00:00,  2.50batch/s]


	 - Train loss: 0.683326
	 - Validation Loss: 0.609513
	 - Validation Accuracy: 0.663462
	 - Validation F1: 0.590991
	 - Validation Recall: 0.521418
	 - Validation Precision: 0.717979
	 * Model @ epoch 1 saved to model_saves/iSarcasm_control_03/E01_A0.66_F0.59


Epoch 2: 100%|██████████| 260/260 [05:45<00:00,  1.33s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.49batch/s]


	 - Train loss: 0.500766
	 - Validation Loss: 0.444631
	 - Validation Accuracy: 0.800962
	 - Validation F1: 0.815006
	 - Validation Recall: 0.939009
	 - Validation Precision: 0.738140
	 * Model @ epoch 2 saved to model_saves/iSarcasm_control_03/E02_A0.8_F0.82


Epoch 3: 100%|██████████| 260/260 [05:46<00:00,  1.33s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.47batch/s]


	 - Train loss: 0.248728
	 - Validation Loss: 0.341265
	 - Validation Accuracy: 0.866346
	 - Validation F1: 0.865319
	 - Validation Recall: 0.918410
	 - Validation Precision: 0.829450
	 * Model @ epoch 3 saved to model_saves/iSarcasm_control_03/E03_A0.87_F0.87


Epoch 4: 100%|██████████| 260/260 [05:45<00:00,  1.33s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.50batch/s]


	 - Train loss: 0.123816
	 - Validation Loss: 0.278364
	 - Validation Accuracy: 0.914423
	 - Validation F1: 0.908326
	 - Validation Recall: 0.941225
	 - Validation Precision: 0.886106
	 * Model @ epoch 4 saved to model_saves/iSarcasm_control_03/E04_A0.91_F0.91


Epoch 5: 100%|██████████| 260/260 [05:47<00:00,  1.34s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.50batch/s]


	 - Train loss: 0.054288
	 - Validation Loss: 0.329762
	 - Validation Accuracy: 0.915385
	 - Validation F1: 0.908635
	 - Validation Recall: 0.906205
	 - Validation Precision: 0.921562
	 * Model @ epoch 5 saved to model_saves/iSarcasm_control_03/E05_A0.92_F0.91


Epoch 6: 100%|██████████| 260/260 [05:48<00:00,  1.34s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.48batch/s]


	 - Train loss: 0.052947
	 - Validation Loss: 0.491207
	 - Validation Accuracy: 0.880769
	 - Validation F1: 0.881543
	 - Validation Recall: 0.949040
	 - Validation Precision: 0.834958
	 * Model @ epoch 6 saved to model_saves/iSarcasm_control_03/E06_A0.88_F0.88


Epoch 7: 100%|██████████| 260/260 [05:48<00:00,  1.34s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.47batch/s]


	 - Train loss: 0.031083
	 - Validation Loss: 0.566721
	 - Validation Accuracy: 0.860577
	 - Validation F1: 0.862899
	 - Validation Recall: 0.954315
	 - Validation Precision: 0.799051
	 * Model @ epoch 7 saved to model_saves/iSarcasm_control_03/E07_A0.86_F0.86


Epoch 8: 100%|██████████| 260/260 [05:49<00:00,  1.34s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.46batch/s]


	 - Train loss: 0.031855
	 - Validation Loss: 0.369116
	 - Validation Accuracy: 0.903846
	 - Validation F1: 0.900232
	 - Validation Recall: 0.940780
	 - Validation Precision: 0.872365
	 * Model @ epoch 8 saved to model_saves/iSarcasm_control_03/E08_A0.9_F0.9


Epoch 9: 100%|██████████| 260/260 [05:50<00:00,  1.35s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.47batch/s]


	 - Train loss: 0.020477
	 - Validation Loss: 0.328691
	 - Validation Accuracy: 0.924038
	 - Validation F1: 0.919266
	 - Validation Recall: 0.931683
	 - Validation Precision: 0.914593
	 * Model @ epoch 9 saved to model_saves/iSarcasm_control_03/E09_A0.92_F0.92


Epoch 10: 100%|██████████| 260/260 [05:49<00:00,  1.35s/batch]
	 Validation 64: 100%|██████████| 65/65 [00:26<00:00,  2.47batch/s]


	 - Train loss: 0.023232
	 - Validation Loss: 0.406462
	 - Validation Accuracy: 0.915385
	 - Validation F1: 0.911268
	 - Validation Recall: 0.939150
	 - Validation Precision: 0.894255
	 * Model @ epoch 10 saved to model_saves/iSarcasm_control_03/E10_A0.92_F0.91


## Load & Predict

### Full Test

In [None]:
from transformers import TextClassificationPipeline
from transformers import AutoModelForSequenceClassification

PATH = 'model_saves/bert_sarc_long_test/E04_A0.92_F0.91/'
model = AutoModelForSequenceClassification.from_pretrained(PATH, local_files_only=True)
tokenizer = RobertaTokenizer.from_pretrained(PATH, local_files_only=True)

# define pipeline
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=2)

In [None]:
df = pd.read_csv('data/target_semEval2022_en/iSarcasmEval-main/test/task_A_En_test.csv')
df = df.rename(columns={'tweet': 'text'})
df = df.rename(columns={'sarcastic': 'label'})

df.head()

In [None]:
test_input = df['text'].to_list()

test_output = []

# run tests and append to output
with tqdm(test_input, unit="test") as prog:
    for step, test in enumerate(prog):
        prog.set_description(f"Test {step}")
        test_output.append(pipe(test)[0])

In [None]:
# parse predictions to new list
predictions = []

for i in test_output:
    predictions.append(i[0]['label'])
    
print(len(predictions))

In [None]:
df['preds'] = predictions
df["preds"] = df["preds"].str.replace("LABEL_","")
df['preds'] = df["preds"].astype(int)
df.tail()

In [None]:
df.info()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# epoch 3
acc = accuracy_score(df['label'], df['preds'])
f1 = f1_score(df['label'], df['preds'])

print(acc)
print(f1)

In [None]:
print(1e-05)
print(type(1e-05))