# target_iSarcasm
This notebook takes models trained on the intermediate tasks and fine-tunes them further on our target task, iSarcasm.

## Imports & Settings

First, update working directory to parent so that we may use our custom functions

In [1]:
import os
os.chdir('..')
# os.getcwd( )

In [2]:
import params
from utils import *
from trainer import *

import numpy as np
import pandas as pd
from datasets import load_from_disk

from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# suppress model warning
from transformers import logging
logging.set_verbosity_error()

# set logging level
import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

In [3]:
# set general seeds
set_seeds(1)

# set dataloader generator seed
g = torch.Generator()
g.manual_seed(1)

# load best intermediate/control model
# path format example:  "model_saves/intermediate_XED_binary_01/E03_A0.83_F0.82"
intermediate_model_path: str = "model_saves/intermediate_IMDB_01/E02_A0.94_F0.94"


# Ensure we're on an ARM environment if necessary.
platform_check()

We're Armed: macOS-13.1-arm64-i386-64bit


## Load Data

### iSarcasm

In [4]:
params.dataset_path = "data/target_iSarcasmEval/itesd_iSarcasmEval_balanced.hf"

datasets = load_from_disk(params.dataset_path)
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4266
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 628
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
})

In [5]:
# we will need to view and prep the datasets
# this is more easily done as dataframes
train_df = datasets['train'].to_pandas()
validate_df = datasets['validation'].to_pandas() 

In [6]:
# view training dataset
print("train_df Info:")
print(train_df.info())
print("\ntrain_df Value Counts")
print(train_df['label'].value_counts())

train_df Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4266 entries, 0 to 4265
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4266 non-null   object
 1   label   4266 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 66.8+ KB
None

train_df Value Counts
1    2133
0    2133
Name: label, dtype: int64


In [7]:
# view validation dataset
print("validate_df Info:")
print(validate_df.info())
print("\n validate_df Value Counts")
print(validate_df['label'].value_counts())

validate_df Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 628 entries, 0 to 627
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    628 non-null    object
 1   label   628 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.9+ KB
None

 validate_df Value Counts
0    467
1    161
Name: label, dtype: int64


### Preprocess

In [8]:
# update tokenizer to use trained model's tokenizer
params.tokenizer = RobertaTokenizer.from_pretrained(intermediate_model_path, local_files_only=True)

print(params.tokenizer)


PreTrainedTokenizer(name_or_path='model_saves/intermediate_IMDB_01/E02_A0.94_F0.94', vocab_size=50265, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})


In [9]:
train_token_ids, train_attention_masks = encode_text(train_df.text.values)
validate_token_ids, validate_attention_masks = encode_text(validate_df.text.values)

In [10]:
train_features = []
for i in range(len(train_token_ids)):
    train_features.append({'label': train_df.label.values[i], 
                           'input_ids': train_token_ids[i], 
                           'attention_mask':train_attention_masks[i]})

validate_features = []
for i in range(len(validate_token_ids)):
    validate_features.append({'label': validate_df.label.values[i], 
                              'input_ids': validate_token_ids[i],
                              'attention_mask':validate_attention_masks[i]})

### Data Split
We split the dataset into train (80%) and validation (20%) sets, and wrap them around a torch.utils.data.DataLoader object.

In [11]:
# Prepare DataLoader
train_dataloader = DataLoader(
            train_features,
            sampler = RandomSampler(train_features),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
            collate_fn=collate
        )

validation_dataloader = DataLoader(
            validate_features,
            sampler = RandomSampler(validate_features),
            batch_size = params.batch_size,
            worker_init_fn=seed_worker,
            generator=g,
            collate_fn=collate
        )

## intermediate_XED_binary

In [12]:
# set params for this model
params.num_labels = 2
params.output_dir = "model_saves/target-iSarcasm_inter-XED-binary_03"

### Train

Download transformers.RobertaForSequenceClassificatio, which is a RoBERTa model with a linear layer for sentence classification (or regression) on top of the pooled output:

In [13]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(intermediate_model_path,
                                                         num_labels = params.num_labels,
                                                         local_files_only=True, 
                                                         ignore_mismatched_sizes=True,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForSequenceClassification                             [1, 2]                    --
├─RobertaModel: 1-1                                          [1, 512, 768]             --
│    └─RobertaEmbeddings: 2-1                                [1, 512, 768]             --
│    │    └─Embedding: 3-1                                   [1, 512, 768]             38,603,520
│    │    └─Embedding: 3-2                                   [1, 512, 768]             768
│    │    └─Embedding: 3-3                                   [1, 512, 768]             394,752
│    │    └─LayerNorm: 3-4                                   [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                     [1, 512, 768]             --
│    └─RobertaEncoder: 2-2                                   [1, 512, 768]             --
│    │    └─ModuleList: 3-6                                  --               

Set model to device, initialize trainer

In [14]:
model.to(params.device)
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(params=model.parameters(), 
                             lr=params.learning_rate,
                             weight_decay=params.weight_decay) #roberta

trainer = Trainer(model=model,
                  device=params.device,
                  tokenizer=params.tokenizer,
                  train_dataloader=train_dataloader,
                  validation_dataloader=validation_dataloader,
                  epochs=params.epochs,
                  optimizer=optimizer,
                  val_loss_fn=params.val_loss_fn,
                  num_labels=params.num_labels,
                  output_dir=params.output_dir,
                  save_freq=params.save_freq,
                  checkpoint_freq=params.checkpoint_freq)

output_parameters()

Device: mps

          Training Dataset: data/target_iSarcasmEval/itesd_iSarcasmEval_balanced.hf
          Number of Labels: 2
          Batch Size: 16
          Learning Rate: 1e-05
          Weight Decay: 0
          Epochs: 10
          Output Directory: model_saves/target-iSarcasm_inter-XED-binary_03
          Save Frequency: 1
          Checkpoint Frequency: 1
          Max Length: 256
          


Fit the model to our training data.

In [15]:
trainer.fit()

Epoch 1: 100%|██████████| 267/267 [02:04<00:00,  2.14batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:07<00:00,  5.53batch/s]

 	 - Train loss: 0.640834
	 - Validation Loss: 0.620484
	 - Validation Accuracy: 0.662500
	 - Validation F1: 0.439320
	 - Validation Recall: 0.569881
	 - Validation Precision: 0.382054 

	 * Model @ epoch 1 saved to model_saves/target-iSarcasm_inter-XED-binary_03/E01_A0.66_F0.44
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-XED-binary_03/E01_A0.66_F0.44/checkpoint.pt

Epoch 2: 100%|██████████| 267/267 [01:53<00:00,  2.36batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:05<00:00,  6.91batch/s]

 	 - Train loss: 0.399995
	 - Validation Loss: 0.649387
	 - Validation Accuracy: 0.725000
	 - Validation F1: 0.468172
	 - Validation Recall: 0.520298
	 - Validation Precision: 0.455119 

	 * Model @ epoch 2 saved to model_saves/target-iSarcasm_inter-XED-binary_03/E02_A0.72_F0.47
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-XED-b

## intermediate_XED_fine

In [12]:
# set params for this model
params.num_labels = 2
params.output_dir = "model_saves/target-iSarcasm_inter-XED-fine_03"

### Train

Download transformers.RobertaForSequenceClassificatio, which is a RoBERTa model with a linear layer for sentence classification (or regression) on top of the pooled output:

In [13]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(intermediate_model_path,
                                                         num_labels = params.num_labels,
                                                         local_files_only=True, 
                                                         ignore_mismatched_sizes=True,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForSequenceClassification                             [1, 2]                    --
├─RobertaModel: 1-1                                          [1, 512, 768]             --
│    └─RobertaEmbeddings: 2-1                                [1, 512, 768]             --
│    │    └─Embedding: 3-1                                   [1, 512, 768]             38,603,520
│    │    └─Embedding: 3-2                                   [1, 512, 768]             768
│    │    └─Embedding: 3-3                                   [1, 512, 768]             394,752
│    │    └─LayerNorm: 3-4                                   [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                     [1, 512, 768]             --
│    └─RobertaEncoder: 2-2                                   [1, 512, 768]             --
│    │    └─ModuleList: 3-6                                  --               

Set model to device, initialize trainer

In [14]:
model.to(params.device)
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(params=model.parameters(), 
                             lr=params.learning_rate,
                             weight_decay=params.weight_decay) #roberta

trainer = Trainer(model=model,
                  device=params.device,
                  tokenizer=params.tokenizer,
                  train_dataloader=train_dataloader,
                  validation_dataloader=validation_dataloader,
                  epochs=params.epochs,
                  optimizer=optimizer,
                  val_loss_fn=params.val_loss_fn,
                  num_labels=params.num_labels,
                  output_dir=params.output_dir,
                  save_freq=params.save_freq,
                  checkpoint_freq=params.checkpoint_freq)

output_parameters()

Device: mps

          Training Dataset: data/target_iSarcasmEval/itesd_iSarcasmEval_balanced.hf
          Number of Labels: 2
          Batch Size: 16
          Learning Rate: 1e-05
          Weight Decay: 0
          Epochs: 10
          Output Directory: model_saves/target-iSarcasm_inter-XED-fine_03
          Save Frequency: 1
          Checkpoint Frequency: 1
          Max Length: 256
          


Fit the model to our training data.

In [15]:
trainer.fit()

Epoch 1: 100%|██████████| 267/267 [02:06<00:00,  2.10batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:07<00:00,  5.48batch/s]

 	 - Train loss: 0.607637
	 - Validation Loss: 0.536672
	 - Validation Accuracy: 0.732812
	 - Validation F1: 0.485368
	 - Validation Recall: 0.530298
	 - Validation Precision: 0.481915 

	 * Model @ epoch 1 saved to model_saves/target-iSarcasm_inter-XED-fine_03/E01_A0.73_F0.49
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-XED-fine_03/E01_A0.73_F0.49/checkpoint.pt

Epoch 2: 100%|██████████| 267/267 [01:53<00:00,  2.36batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:05<00:00,  6.95batch/s]

 	 - Train loss: 0.350186
	 - Validation Loss: 0.553531
	 - Validation Accuracy: 0.765625
	 - Validation F1: 0.471622
	 - Validation Recall: 0.467560
	 - Validation Precision: 0.541448 

	 * Model @ epoch 2 saved to model_saves/target-iSarcasm_inter-XED-fine_03/E02_A0.77_F0.47
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-XED-fine_03

## intermediate_SARC

In [12]:
# set params for this model
params.num_labels = 2
params.output_dir = "model_saves/target-iSarcasm_inter-SARC_03"

### Train

Download transformers.RobertaForSequenceClassificatio, which is a RoBERTa model with a linear layer for sentence classification (or regression) on top of the pooled output:

In [13]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(intermediate_model_path,
                                                         num_labels = params.num_labels,
                                                         local_files_only=True, 
                                                         ignore_mismatched_sizes=True,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForSequenceClassification                             [1, 2]                    --
├─RobertaModel: 1-1                                          [1, 512, 768]             --
│    └─RobertaEmbeddings: 2-1                                [1, 512, 768]             --
│    │    └─Embedding: 3-1                                   [1, 512, 768]             38,603,520
│    │    └─Embedding: 3-2                                   [1, 512, 768]             768
│    │    └─Embedding: 3-3                                   [1, 512, 768]             394,752
│    │    └─LayerNorm: 3-4                                   [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                     [1, 512, 768]             --
│    └─RobertaEncoder: 2-2                                   [1, 512, 768]             --
│    │    └─ModuleList: 3-6                                  --               

Set model to device, initialize trainer

In [14]:
model.to(params.device)
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(params=model.parameters(), 
                             lr=params.learning_rate,
                             weight_decay=params.weight_decay) #roberta

trainer = Trainer(model=model,
                  device=params.device,
                  tokenizer=params.tokenizer,
                  train_dataloader=train_dataloader,
                  validation_dataloader=validation_dataloader,
                  epochs=params.epochs,
                  optimizer=optimizer,
                  val_loss_fn=params.val_loss_fn,
                  num_labels=params.num_labels,
                  output_dir=params.output_dir,
                  save_freq=params.save_freq,
                  checkpoint_freq=params.checkpoint_freq)

output_parameters()

Device: mps

          Training Dataset: data/target_iSarcasmEval/itesd_iSarcasmEval_balanced.hf
          Number of Labels: 2
          Batch Size: 16
          Learning Rate: 1e-05
          Weight Decay: 0
          Epochs: 10
          Output Directory: model_saves/target-iSarcasm_inter-SARC_03
          Save Frequency: 1
          Checkpoint Frequency: 1
          Max Length: 256
          


Fit the model to our training data.

In [15]:
trainer.fit()

Epoch 1: 100%|██████████| 267/267 [02:06<00:00,  2.11batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:07<00:00,  5.42batch/s]

 	 - Train loss: 0.568651
	 - Validation Loss: 0.531797
	 - Validation Accuracy: 0.753125
	 - Validation F1: 0.467552
	 - Validation Recall: 0.468988
	 - Validation Precision: 0.511607 

	 * Model @ epoch 1 saved to model_saves/target-iSarcasm_inter-SARC_03/E01_A0.75_F0.47
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-SARC_03/E01_A0.75_F0.47/checkpoint.pt

Epoch 2: 100%|██████████| 267/267 [01:55<00:00,  2.31batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:05<00:00,  6.85batch/s]

 	 - Train loss: 0.291537
	 - Validation Loss: 0.622932
	 - Validation Accuracy: 0.787500
	 - Validation F1: 0.481081
	 - Validation Recall: 0.489226
	 - Validation Precision: 0.549107 

	 * Model @ epoch 2 saved to model_saves/target-iSarcasm_inter-SARC_03/E02_A0.79_F0.48
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-SARC_03/E02_A0.79_F0.48

## intermediate_IMDB

In [12]:
# set params for this model
params.num_labels = 2
params.output_dir = "model_saves/target-iSarcasm_inter-IMDB_03"

### Train

Download transformers.RobertaForSequenceClassificatio, which is a RoBERTa model with a linear layer for sentence classification (or regression) on top of the pooled output:

In [13]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(intermediate_model_path,
                                                         num_labels = params.num_labels,
                                                         local_files_only=True, 
                                                         ignore_mismatched_sizes=True,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForSequenceClassification                             [1, 2]                    --
├─RobertaModel: 1-1                                          [1, 512, 768]             --
│    └─RobertaEmbeddings: 2-1                                [1, 512, 768]             --
│    │    └─Embedding: 3-1                                   [1, 512, 768]             38,603,520
│    │    └─Embedding: 3-2                                   [1, 512, 768]             768
│    │    └─Embedding: 3-3                                   [1, 512, 768]             394,752
│    │    └─LayerNorm: 3-4                                   [1, 512, 768]             1,536
│    │    └─Dropout: 3-5                                     [1, 512, 768]             --
│    └─RobertaEncoder: 2-2                                   [1, 512, 768]             --
│    │    └─ModuleList: 3-6                                  --               

Set model to device, initialize trainer

In [14]:
model.to(params.device)
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(params=model.parameters(), 
                             lr=params.learning_rate,
                             weight_decay=params.weight_decay) #roberta

trainer = Trainer(model=model,
                  device=params.device,
                  tokenizer=params.tokenizer,
                  train_dataloader=train_dataloader,
                  validation_dataloader=validation_dataloader,
                  epochs=params.epochs,
                  optimizer=optimizer,
                  val_loss_fn=params.val_loss_fn,
                  num_labels=params.num_labels,
                  output_dir=params.output_dir,
                  save_freq=params.save_freq,
                  checkpoint_freq=params.checkpoint_freq)

output_parameters()

Device: mps

          Training Dataset: data/target_iSarcasmEval/itesd_iSarcasmEval_balanced.hf
          Number of Labels: 2
          Batch Size: 16
          Learning Rate: 1e-05
          Weight Decay: 0
          Epochs: 10
          Output Directory: model_saves/target-iSarcasm_inter-IMDB_03
          Save Frequency: 1
          Checkpoint Frequency: 1
          Max Length: 256
          


Fit the model to our training data.

In [15]:
trainer.fit()

Epoch 1: 100%|██████████| 267/267 [02:06<00:00,  2.10batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:07<00:00,  5.47batch/s]

 	 - Train loss: 0.702509
	 - Validation Loss: 0.689584
	 - Validation Accuracy: 0.465625
	 - Validation F1: 0.403497
	 - Validation Recall: 0.712857
	 - Validation Precision: 0.292078 

	 * Model @ epoch 1 saved to model_saves/target-iSarcasm_inter-IMDB_03/E01_A0.47_F0.4
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-IMDB_03/E01_A0.47_F0.4/checkpoint.pt

Epoch 2: 100%|██████████| 267/267 [02:11<00:00,  2.03batch/s]
	 Validation 39: 100%|██████████| 40/40 [00:05<00:00,  6.89batch/s]

 	 - Train loss: 0.602461
	 - Validation Loss: 0.593465
	 - Validation Accuracy: 0.689063
	 - Validation F1: 0.430380
	 - Validation Recall: 0.509048
	 - Validation Precision: 0.429881 

	 * Model @ epoch 2 saved to model_saves/target-iSarcasm_inter-IMDB_03/E02_A0.69_F0.43
	 * Model checkpoint saved to model_saves/target-iSarcasm_inter-IMDB_03/E02_A0.69_F0.43/c

## intermediate_hellaswag

In [None]:
# set params for this model
params.num_labels = 2
params.output_dir = "model_saves/target-iSarcasm_inter-hellaswag_03"

### Train

Download transformers.RobertaForSequenceClassificatio, which is a RoBERTa model with a linear layer for sentence classification (or regression) on top of the pooled output:

In [None]:
# Load the RobertaForSequenceClassification model
model = RobertaForSequenceClassification.from_pretrained(intermediate_model_path,
                                                         num_labels = params.num_labels,
                                                         local_files_only=True, 
                                                         ignore_mismatched_sizes=True,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
                                                         )

from torchinfo import summary
summary(model, input_size=(1, 512), dtypes=['torch.IntTensor'])

Set model to device, initialize trainer

In [None]:
model.to(params.device)
print(f"Device: {params.device}")

optimizer = torch.optim.Adam(params=model.parameters(), 
                             lr=params.learning_rate,
                             weight_decay=params.weight_decay) #roberta

trainer = Trainer(model=model,
                  device=params.device,
                  tokenizer=params.tokenizer,
                  train_dataloader=train_dataloader,
                  validation_dataloader=validation_dataloader,
                  epochs=params.epochs,
                  optimizer=optimizer,
                  val_loss_fn=params.val_loss_fn,
                  num_labels=params.num_labels,
                  output_dir=params.output_dir,
                  save_freq=params.save_freq,
                  checkpoint_freq=params.checkpoint_freq)

output_parameters()

Fit the model to our training data.

In [None]:
trainer.fit()