In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import transformers
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup
from torch.nn.utils import clip_grad_norm
import gc
from transformers.adapters import AutoAdapterModel, RobertaAdapterModel
from transformers import RobertaTokenizer
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

In [3]:
# Read both CSV splits; only the first 20,000 training rows are kept.
test_fl = pd.read_csv("sar_and_meta_test.csv")
train_fl = pd.read_csv("sar_and_meta_train.csv").iloc[:20000]

In [4]:
# Preview the training frame (capped at 20,000 rows when it was loaded).
train_fl

Unnamed: 0,label,text
0,1,P: Or they think what are the odds it'll happe...
1,0,Instead of having the uperclass already paying...
2,0,P: Nope. She killed Ronnie twice. But then aga...
3,0,"P: Yeah, Jon is Dany's brother's son; C: So th..."
4,0,Poverty engenders a social environment charact...
...,...,...
19995,1,P: Bavaria is like Germany's Texas. If they do...
19996,1,"P: Dude, that's what breasts are for. Why else..."
19997,0,P: then give it you cheap fuck; C: Why on eart...
19998,1,"P: I wish they did this to TF2, that would be ..."


In [5]:
# Split configuration: fraction of rows held out for validation, and the
# random seed handed to train_test_split.
val_size, seed = 0.1, 42

In [6]:
# Hold out `val_size` of the rows for validation; `seed` fixes the shuffle.
train_fl, val_fl = train_test_split(
    train_fl,
    random_state=seed,
    test_size=val_size,
)

In [7]:
# Preview the training rows that remain after the train/validation split.
train_fl

Unnamed: 0,label,text
16270,0,Civilian Gun Control is a distraction from the...
1383,1,"P: Geez, did you get a souvenir photo with tha..."
3472,0,P: Something tells me they were using abnormal...
19900,0,"As far as I'm concerned, the NRA showed itself..."
2150,0,"P: ""Look at this, this is my cocaine fort. No ..."
...,...,...
11284,1,P: As a brit - 10/10 would queue again; C: sti...
11964,1,P: my car is worth less than my 980ti; C: When...
5390,0,"You claim you support gun ownership, and reaso..."
860,1,"P: Don't vote.; C: No, obviously you have to u..."


In [8]:
# Wrap both pandas frames as `datasets.Dataset` objects without carrying the
# pandas index over as a column.
train_fl, val_fl = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_fl, val_fl)
)

In [9]:
# Inspect the first training example as a dict of column -> value.
train_fl[0]

{'label': 0,
 'text': 'Civilian Gun Control is a distraction from the real issue.'}

In [10]:
# Show the validation dataset summary (columns and row count).
val_fl

Dataset({
    features: ['label', 'text'],
    num_rows: 2000
})

In [11]:
# Tokenizer for the BERTweet checkpoint; the fast implementation is
# explicitly disabled.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    use_fast=False,
)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def encode_batch(batch, max_length=128, tok=None):
  """Encodes a batch of input data using the model tokenizer.

  Args:
    batch: Mapping with a "text" entry holding the list of strings to encode
      (this is what `Dataset.map(..., batched=True)` passes in).
    max_length: Length every sequence is padded/truncated to. Defaults to 128,
      the value previously hard-coded here.
    tok: Tokenizer callable to use. Defaults to the notebook-level `tokenizer`,
      so existing `map(encode_batch, batched=True)` calls behave as before.

  Returns:
    Whatever the tokenizer returns for the batch.
  """
  # Only fall back to the global when no tokenizer is supplied, so the function
  # can be reused (and exercised) without relying on notebook state.
  active_tokenizer = tokenizer if tok is None else tok
  return active_tokenizer(
      batch["text"],
      add_special_tokens=True,
      padding='max_length',
      max_length=max_length,
      truncation=True,
      return_attention_mask=True,
  )

# Tokenize the training and validation splits batch-wise with encode_batch.
train_fl, val_fl = (
    split.map(encode_batch, batched=True)
    for split in (train_fl, val_fl)
)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [13]:
# Check which columns the training split has after tokenization.
train_fl

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18000
})

In [14]:
# Check which columns the validation split has after tokenization.
val_fl

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [15]:
# Columns that are emitted as torch tensors when a split is indexed.
model_columns = ("input_ids", "attention_mask", "labels")

# Rename the target column from "label" to "labels" on both splits.
train_fl = train_fl.rename_column("label", "labels")
val_fl = val_fl.rename_column("label", "labels")

# Transform to pytorch tensors and only output the required columns
for split in (train_fl, val_fl):
    split.set_format(type="torch", columns=list(model_columns))

In [16]:
# Confirm the training split now carries a "labels" column.
train_fl

Dataset({
    features: ['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18000
})

In [17]:
# Confirm the validation split now carries a "labels" column.
val_fl

Dataset({
    features: ['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [18]:
from transformers import RobertaConfig, RobertaModelWithHeads

# Configuration for the BERTweet checkpoint with a 3-class label space.
config = RobertaConfig.from_pretrained(
    "vinai/bertweet-base",
    num_labels=3,
)
# RobertaModelWithHeads is the deprecated name of RobertaAdapterModel
# (imported from transformers.adapters in the notebook's import cell).
# Instantiate the current class so this cell keeps working once the old
# alias is removed; the import above is left untouched in case other cells
# still reference the old name.
model = RobertaAdapterModel.from_pretrained(
    "vinai/bertweet-base",
    config=config,
)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Single name shared by the adapter and its classification head.
adapter_name = "fladapter"
# Class index -> human-readable label for the 3-way head.
label_names = {0: "none", 1: "sarcasm", 2: "metaphor"}

# Register the adapter, then attach a classification head under the same name.
model.add_adapter(adapter_name)
model.add_classification_head(adapter_name, num_labels=3, id2label=label_names)

# Activate the adapter for training.
model.train_adapter(adapter_name)

In [20]:
# Trainer hyper-parameters; checkpoints are written under ./training_output.
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # Reuse the notebook-level `seed` (defined next to `val_size`) so the data
    # split and the training run are governed by one value rather than by an
    # implicit library default.
    seed=seed,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: "EvalPrediction"):
  """Computes classification accuracy from an evaluation prediction.

  Args:
    p: Object exposing `predictions` (per-class scores, or a tuple whose first
      element holds them) and `label_ids` (the reference class indices).

  Returns:
    A dict `{"acc": <fraction of examples whose argmax matches the label>}`.
  """
  # `predictions` can arrive as a tuple when the model returns extra outputs;
  # the class scores are then the first element.
  scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  # Argmax over the last (class) axis gives the predicted class per example.
  preds = np.argmax(scores, axis=-1)
  # Return a plain Python float rather than a numpy scalar.
  return {"acc": float((preds == np.asarray(p.label_ids)).mean())}

# Adapter-aware trainer: fits on train_fl, evaluates on val_fl and reports
# the accuracy metric from compute_accuracy.
trainer = AdapterTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_accuracy,
    eval_dataset=val_fl,
    train_dataset=train_fl,
)

In [21]:
# Run the training loop configured in training_args; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 18000
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3378
  Number of trainable parameters = 1487427


Step,Training Loss
200,0.7058
400,0.5973
600,0.5577
800,0.5395
1000,0.5349
1200,0.5105
1400,0.5083
1600,0.498
1800,0.4921
2000,0.4833


Saving model checkpoint to ./training_output/checkpoint-500
Configuration saved in ./training_output/checkpoint-500/fladapter/adapter_config.json
Module weights saved in ./training_output/checkpoint-500/fladapter/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-500/fladapter/head_config.json
Module weights saved in ./training_output/checkpoint-500/fladapter/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/fladapter/head_config.json
Module weights saved in ./training_output/checkpoint-500/fladapter/pytorch_model_head.bin
Saving model checkpoint to ./training_output/checkpoint-1000
Configuration saved in ./training_output/checkpoint-1000/fladapter/adapter_config.json
Module weights saved in ./training_output/checkpoint-1000/fladapter/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-1000/fladapter/head_config.json
Module weights saved in ./training_output/checkpoint-1000/fladapter/pytorch_model_head.bin
Configurat

TrainOutput(global_step=3378, training_loss=0.5126851286811897, metrics={'train_runtime': 625.046, 'train_samples_per_second': 172.787, 'train_steps_per_second': 5.404, 'total_flos': 7227244071936000.0, 'train_loss': 0.5126851286811897, 'epoch': 6.0})

In [22]:
# Evaluate on the validation split passed to the trainer (val_fl) and show the
# resulting metrics dict, including the accuracy from compute_accuracy.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'eval_loss': 0.4898378849029541,
 'eval_acc': 0.768,
 'eval_runtime': 5.6832,
 'eval_samples_per_second': 351.911,
 'eval_steps_per_second': 11.085,
 'epoch': 6.0}

In [23]:
# Wrap the test frame as a `datasets.Dataset`, dropping the pandas index.
test_fl = Dataset.from_pandas(
    df=test_fl,
    preserve_index=False,
)

In [24]:
# Show the test dataset summary (columns and row count).
test_fl

Dataset({
    features: ['label', 'text'],
    num_rows: 18210
})

In [25]:
# Tokenize the test split batch-wise with the same encoder used for training.
test_fl = test_fl.map(
    function=encode_batch,
    batched=True,
)

Map:   0%|          | 0/18210 [00:00<?, ? examples/s]

In [27]:
# Same post-processing as the train/validation splits: rename the target
# column to "labels", then emit torch tensors for the required columns only.
test_fl = test_fl.rename_column(
    original_column_name="label",
    new_column_name="labels",
)
test_fl.set_format(
    columns=["input_ids", "attention_mask", "labels"],
    type="torch",
)

In [28]:
# Confirm the test split now carries a "labels" column.
test_fl

Dataset({
    features: ['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18210
})

In [29]:
# Run the trained model over the test split and keep the prediction output.
res = trainer.predict(test_dataset=test_fl)

***** Running Prediction *****
  Num examples = 18210
  Batch size = 32


In [30]:
# Display the raw prediction output: per-class scores, label ids and test metrics.
res

PredictionOutput(predictions=array([[ 2.1516526 ,  3.1810842 , -6.1749334 ],
       [ 3.612661  , -2.083609  , -1.1310233 ],
       [ 3.5071766 , -3.184903  ,  0.616519  ],
       ...,
       [ 3.880916  , -2.956007  , -0.12211605],
       [ 3.1370227 , -3.9134932 ,  1.7422873 ],
       [ 0.8642302 ,  4.038598  , -6.1159067 ]], dtype=float32), label_ids=array([1, 0, 0, ..., 0, 0, 1]), metrics={'test_loss': 0.5104750990867615, 'test_acc': 0.7604612850082373, 'test_runtime': 47.9079, 'test_samples_per_second': 380.104, 'test_steps_per_second': 11.898})