In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import transformers
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup, BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup
from torch.nn.utils import clip_grad_norm
import gc
from transformers.adapters import AutoAdapterModel, RobertaAdapterModel
from transformers import RobertaTokenizer
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

In [14]:
train_fl = pd.read_csv("sar_and_meta_train.csv")[:20000]
test_fl = pd.read_csv("sar_and_meta_test.csv")

In [15]:
val_size=0.1
seed=42

In [16]:
train_fl, val_fl = train_test_split(train_fl, test_size=val_size, random_state=seed)

In [17]:
train_fl = Dataset.from_pandas(train_fl, preserve_index=False)
val_fl = Dataset.from_pandas(val_fl, preserve_index=False)

In [18]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/xl2473/.cache/huggingface/hub/models--vinai--bertweet-base/snapshots/118ab1d567653bec16bbb081eafb6f8942f72108/config.json
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertweetTokenizer",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": t

In [19]:
def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], add_special_tokens=True, padding='max_length', max_length=128, truncation=True, return_attention_mask=True)

# Encode the input data
train_fl = train_fl.map(encode_batch, batched=True)
val_fl = val_fl.map(encode_batch, batched=True)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [20]:
train_fl = train_fl.rename_column("label", "labels")
# Transform to pytorch tensors and only output the required columns
train_fl.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

val_fl = val_fl.rename_column("label", "labels")
# Transform to pytorch tensors and only output the required columns
val_fl.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [34]:
model_p = torch.load("checkpoints/epoch4@sid32726483.pt")

In [35]:
from transformers import RobertaConfig, RobertaModelWithHeads

config = RobertaConfig.from_pretrained(
    "vinai/bertweet-base",
    num_labels=3,
)
model = RobertaModelWithHeads.from_pretrained(
    "vinai/bertweet-base",
    config=config,
)

loading configuration file config.json from cache at /home/xl2473/.cache/huggingface/hub/models--vinai--bertweet-base/snapshots/118ab1d567653bec16bbb081eafb6f8942f72108/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertweetTokenizer",
  "transformers_version": "4.26.1",
  "type_vocab_size": 

In [36]:
model.roberta.load_state_dict(model_p)

<All keys matched successfully>

In [37]:
# Add a new adapter
model.add_adapter("fladapter")
# Add a matching classification head
model.add_classification_head(
    "fladapter",
    num_labels=3,
    id2label={ 0: "none", 1: "sarcasm", 2:"metaphor"}
  )
# Activate the adapter
model.train_adapter("fladapter")

Adding adapter 'fladapter'.
Adding head 'fladapter' with config {'head_type': 'classification', 'num_labels': 3, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'none': 0, 'sarcasm': 1, 'metaphor': 2}, 'use_pooler': False, 'bias': True}.


In [38]:
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis=1)
  return {"acc": (preds == p.label_ids).mean()}

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_fl,
    eval_dataset=val_fl,
    compute_metrics=compute_accuracy,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [39]:
trainer.train()

***** Running training *****
  Num examples = 18000
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3378
  Number of trainable parameters = 1487427


Step,Training Loss
200,0.7783
400,0.7731
600,0.7638
800,0.7641
1000,0.7695
1200,0.7708
1400,0.7675
1600,0.7612
1800,0.7632
2000,0.768


Saving model checkpoint to ./training_output/checkpoint-500
Configuration saved in ./training_output/checkpoint-500/fladapter/adapter_config.json
Module weights saved in ./training_output/checkpoint-500/fladapter/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-500/fladapter/head_config.json
Module weights saved in ./training_output/checkpoint-500/fladapter/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/fladapter/head_config.json
Module weights saved in ./training_output/checkpoint-500/fladapter/pytorch_model_head.bin
Saving model checkpoint to ./training_output/checkpoint-1000
Configuration saved in ./training_output/checkpoint-1000/fladapter/adapter_config.json
Module weights saved in ./training_output/checkpoint-1000/fladapter/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-1000/fladapter/head_config.json
Module weights saved in ./training_output/checkpoint-1000/fladapter/pytorch_model_head.bin
Configurat

TrainOutput(global_step=3378, training_loss=0.7662090755482973, metrics={'train_runtime': 623.2928, 'train_samples_per_second': 173.273, 'train_steps_per_second': 5.42, 'total_flos': 7227244071936000.0, 'train_loss': 0.7662090755482973, 'epoch': 6.0})

In [40]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'eval_loss': 0.7545523643493652,
 'eval_acc': 0.566,
 'eval_runtime': 5.3778,
 'eval_samples_per_second': 371.897,
 'eval_steps_per_second': 11.715,
 'epoch': 6.0}

In [41]:
test_fl = Dataset.from_pandas(test_fl, preserve_index=False)

In [42]:
test_fl = test_fl.map(encode_batch, batched=True)
test_fl = test_fl.rename_column("label", "labels")
# Transform to pytorch tensors and only output the required columns
test_fl.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/18210 [00:00<?, ? examples/s]

In [43]:
res = trainer.predict(test_fl)
res

***** Running Prediction *****
  Num examples = 18210
  Batch size = 32


PredictionOutput(predictions=array([[ 1.2435882 ,  0.9313903 , -2.2752206 ],
       [ 1.2435883 ,  0.9313903 , -2.2752206 ],
       [ 1.2435883 ,  0.9313903 , -2.2752204 ],
       ...,
       [ 1.2435883 ,  0.9313903 , -2.2752204 ],
       [ 1.2435883 ,  0.9313903 , -2.2752204 ],
       [ 1.2435883 ,  0.93139017, -2.2752204 ]], dtype=float32), label_ids=array([1, 0, 0, ..., 0, 0, 1]), metrics={'test_loss': 0.770438015460968, 'test_acc': 0.5616694124107633, 'test_runtime': 49.1647, 'test_samples_per_second': 370.388, 'test_steps_per_second': 11.594})