In [1]:
import pandas as pd
import numpy as np
import datetime
import re
import string
import contractions
import nltk
from nltk.corpus import stopwords
import torch
import transformers
from transformers import AdamW, ElectraConfig, ElectraTokenizer, ElectraForSequenceClassification, ElectraModel, AutoTokenizer, TrainingArguments, DataCollatorWithPadding, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from torch.utils.data import TensorDataset, DataLoader, Dataset, random_split
import evaluate
from torch.optim import AdamW
from finetuning_scheduler import FinetuningScheduler 
from pytorch_lightning import Trainer, Callback
from pytorch_lightning.callbacks import LearningRateMonitor, ModelSummary, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import torch.nn.functional as F
import torchmetrics
from torchmetrics import Accuracy, Precision, Recall
from torchmetrics.classification import BinaryF1Score
import tensorboard
import matplotlib.pyplot as plt
import os
from torch.optim.lr_scheduler import OneCycleLR, LambdaLR
import json
from sklearn.model_selection import train_test_split
import math


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext tensorboard



In [20]:
%reload_ext tensorboard

In [4]:
rm -rf ./tb_logs/

In [3]:
data_path = "/workspaces/sarcasm_detection/notebooks/project_data/Sarcasm_Headlines_Dataset_v2.json"
#tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
# model = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels = 2)
#configuration = ElectraConfig()
#model = ElectraForSequenceClassification(configuration)
version_number = 1
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
checkpoint_path = f"sarcasm_detection_finetune_ckpt_v{version_number}_{current_time}.ckpt"

In [4]:
#data module
class SarcasmDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
            self.data = data
            self.tokenizer = tokenizer
            self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]['headline']
        labels = self.data[idx]['is_sarcastic']
        encodings = self.tokenizer(
              text, 
              max_length=self.max_length, 
              padding='max_length', 
              truncation=True, 
              return_attention_mask=True, 
              return_token_type_ids=False, 
              return_tensors='pt'
              )
        return encodings['input_ids'].flatten(), encodings['attention_mask'].flatten(), torch.tensor(labels)
    
class SarcasmDataModule(pl.LightningDataModule):
    def __init__(self, data_path, batch_size, tokenizer="google/electra-small-discriminator"):
        super().__init__()
        self.data_path = data_path
        self.tokenizer = ElectraTokenizer.from_pretrained(tokenizer)
        self.batch_size=batch_size

        self.setup()

    
    def prepare_data(self):
        col_types = {'headline':'str', 'is_sarcastic':'int32'}

        df = (
             pd.read_json(self.data_path, lines=True)
             .drop(columns=['article_link'])       
             .astype(col_types)
        )

        train_df, val_df, test_df = self.split_datasets(df)
        # print(f"training df length: {len(train_df)}")
        # print(f"Validation DataFrame length: {len(val_df)}")
        # print(f"Test DataFrame length: {len(test_df)}")
        # print(f"total df len: {len(train_df+val_df+test_df)}")
        # print(len(df))
        
        self.data_train = train_df.to_dict('records')
        self.data_val = val_df.to_dict('records')
        self.data_test = test_df.to_dict('records')
        
    def setup(self, stage: str=None):

        if stage == "fit":
            self.train_dataset = SarcasmDataset(self.data_train, self.tokenizer)
            self.val_dataset = SarcasmDataset(self.data_val, self.tokenizer)
        
        if stage == "test":
            self.test_dataset = SarcasmDataset(self.data_test, self.tokenizer)
        
        if stage == "predict":
            self.predict_dataset = SarcasmDataset(self.data_test, self.tokenizer)

    def steps_per_epoch(self):
        return len(self.train_dataset)
    
    def split_datasets(self, df):
        train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
        val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)
        return train_df, val_df, test_df

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=6, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=6, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=6, shuffle=False)
    
    def predict_dataloader(self):
        return DataLoader(self.predict_dataset, batch_size=self.batch_size, num_workers=6, shuffle=False, collate_fn=self.collate_fn)


In [5]:


class ElectraClassifier(pl.LightningModule):
    def __init__(self, model_name="google/electra-small-discriminator", num_labels=2, learning_rate=2e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.warmup_steps = 10000
        self.finetuning_scheduler = FinetuningScheduler


        #metrics
        self.train_accuracy = Accuracy(task='binary', num_classes=num_labels)
        self.val_accuracy = Accuracy(task='binary', num_classes=num_labels)
        self.train_precision = Precision(task='binary', num_classes=num_labels, average = 'weighted')
        self.val_precision = Precision(task='binary', num_classes=num_labels, average = 'weighted')
        self.train_recall = Recall(task='binary', num_classes=num_labels, average = 'weighted')
        self.val_recall = Recall(task='binary', num_classes=num_labels, average = 'weighted')
        self.val_f1_score = BinaryF1Score(task='binary', num_classes=num_labels)
        self.f1 = BinaryF1Score(task='binary', num_classes=num_labels, average = 'macro')

        self.finetuning_scheduler.freeze(self.model.electra)

        #for adding smaller networks on top
        #self.dropout = nn.Dropout(0.1)
        #self.fc1 = nn.Linear(self.electra.config.hidden_size, 128)
        #self.fc2 = nn.Linear(128, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
        #for adding smaller networks on top
        #outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        #last_hidden_state = outputs.last_hidden_state
        #cls_token = last_hidden_state[:, 0]
        #x = self.dropout(cls_token)
        #x = self.fc1(x)
        #x = torch.relu(x)
        #x = self.fc2(x)
        #return x
    
    def on_train_batch_start(self, batch, batch_idx):
        if self.global_step == self.warmup_steps:
            #unfreeze base layers
            for param in self.model.electra.parameters():
                param.requires_grad = True

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        preds = outputs.logits.argmax(dim=-1)

        #logging
        acc = self.train_accuracy(preds, labels)
        prec = self.train_precision(preds, labels)
        rec = self.train_recall(preds, labels)
        self.log("train_loss", loss, on_step = True, on_epoch = True, prog_bar = True)
        self.log("train_accuracy", acc, on_step = True, on_epoch = True, prog_bar=True)
        self.log("train_precision", prec, on_step = True, on_epoch = True, prog_bar=True)
        self.log("train_recall", rec, on_step = True, on_epoch = True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        preds = outputs.logits.argmax(dim=-1)

        #logging
        acc = self.val_accuracy(preds, labels)
        prec = self.val_precision(preds, labels)
        rec = self.val_recall(preds, labels)
        self.log("val_loss", loss)
        self.log("val_accuracy", acc)
        self.log("val_precision", prec)
        self.log("val_recall", rec)

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        preds = outputs.logits.argmax(dim=-1)
        f1_score = self.f1(preds, labels)
        self.log('test_f1', f1_score, on_step=True, on_epoch=True, prog_bar=True)
    
    def predict_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        preds = outputs.logits.argmax(dim=-1)
        return preds
    
    def collate_fn(self, batch):
        inputs, labels = zip(*batch)
        return torch.stack(inputs), torch.stack(labels)

    @property
    def lr(self):
        return self.optimizers().param_groups[0]['lr']

    def configure_optimizers(self):
        # num_epochs = self.trainer.max_epochs

        # if self.trainer.state.stage == 'fit':
        #     steps_per_epoch = len(self.train_dataloader()) // self.hparams.batch_size
        
        # else:
        #     steps_per_epoch = 1

        # warmup_steps = 10000
        
        # total_steps = num_epochs * steps_per_epoch

        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate)
        scheduler_config = {
        'scheduler': FinetuningScheduler(
            optimizer,
            milestones=[
                (0, 0.00002),
                (10000, 0.0002),
                (20000, 0.002)
            ],
        ),
        'name': 'learning_rate',
        'interval': 'step',
        'frequency': 1
    }
        return [optimizer],[scheduler_config]

In [6]:
#metrics plotting

class MetricsCallback(Callback):
    def __init__(self):
        super().__init__()
        self.train_metrics = []
        self.val_metrics = []
        self.batch_train_metrics = []
    
    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        self.batch_train_metrics.append(trainer.callback_metrics)

    def on_train_epoch_end(self, trainer, pl_module):
        epoch_train_metrics = {}
        for key in self.batch_train_metrics[0].keys():
            epoch_train_metrics[key] = torch.stack([x[key] for x in self.batch_train_metrics]).mean()
        self.train_metrics.append(epoch_train_metrics)
        self.batch_train_metrics = []

    def on_validation_epoch_end(self, trainer, pl_module):
        self.val_metrics.append(trainer.callback_metrics)

In [7]:
model = ElectraClassifier()
data_module = SarcasmDataModule(data_path=data_path, batch_size=16)
lr_monitor = LearningRateMonitor(logging_interval = 'step')
logger = TensorBoardLogger("tb_logs", name="electra_model_v1")
metrics_callback = MetricsCallback()
early_stopping = EarlyStopping('val_loss', patience=10, verbose=True)

trainer = Trainer(
    max_epochs=10000,
    callbacks=[lr_monitor, metrics_callback, early_stopping],
    logger=logger
)



Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [8]:


trainer.fit(model, data_module)

trainer.save_checkpoint(checkpoint_path)

Missing logger folder: tb_logs/electra_model_v1
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


TypeError: __init__() got an unexpected keyword argument 'milestones'

In [29]:
%tensorboard --logdir=tb_logs/

In [None]:
result = trainer.test(model, data_module)

In [29]:
trainer.predict(model, data_module)

  rank_zero_warn(
Restoring states from the checkpoint path at tb_logs/electra_model_v0/version_1/checkpoints/epoch=45-step=65826.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at tb_logs/electra_model_v0/version_1/checkpoints/epoch=45-step=65826.ckpt


Predicting DataLoader 0:   0%|          | 0/179 [00:00<?, ?it/s]

TypeError: forward() missing 1 required positional argument: 'attention_mask'

In [19]:
#find most recent version of model checkpoint

def find_latest_checkpoint(version_prefix = "sarcasm_detection_finetune_ckpt_v"):
    checkpoints = [file for file in os.listdir() if file.startswith(version_prefix) and file.endswith(".ckpt")]
    return max(checkpoints, key=os.path.getctime) if checkpoints else None

In [20]:
latest_checkpoint = find_latest_checkpoint()

if latest_checkpoint:

    #load model
    loaded_model = ElectraClassifier.load_from_checkpoint(latest_checkpoint)

    #new model with different classification heads
    transfer_model = ElectraClassifier(num_labels=6)

    #load fine_tuned model weights keeping new classification head
    transfer_model.electra.load_state_dict(loaded_model.electra.state_dict())

else:
    print("No model checkpoint found")

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

AttributeError: 'ElectraClassifier' object has no attribute 'electra'

In [31]:
model.eval()

ElectraClassifier(
  (model): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(30522, 128, padding_idx=0)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0-11): 12 x ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=256, out_features=256, bias=True)
                (key): Linear(in_features=256, out_features=256, bias=True)
                (value): Linear(in_features=256, out_features=256, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              

In [None]:
# Create a new model with a different number of classes
transfer_model = ElectraClassifier(num_labels=3)

# Load the fine-tuned model weights, but keep the new classification head
transfer_model.model.electra.load_state_dict(loaded_model.model.electra.state_dict())

In [82]:
#training arguments

training_args = TrainingArguments(
    output_dir='.',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='steps',
    eval_steps=500,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
#evaluation

predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [None]:
metric = evaluate.load("f1", "accuracy", "precision")
results = metric.compute(predictions=preds, references=predictions.label_ids)

In [83]:
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy", "precision", "f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions = predictions, references = labels)

In [71]:


trainer = Trainer(
    model, 
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
#evaluation

predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [None]:
metric = evaluate.load("f1", "accuracy", "precision")
results = metric.compute(predictions=preds, references=predictions.label_ids)


In [None]:

# train_encodings = tokenizer.batch_encode_plus(list(train_df['headline']), max_length=512, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False, return_tensors='pt')
# val_encodings = tokenizer.batch_encode_plus(list(val_df['headline']), max_length=512, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False, return_tensors='pt')
# test_encodings = tokenizer.batch_encode_plus(list(test_df['headline']), max_length=512, padding=True, truncation=True, return_attention_mask=True, return_token_type_ids=False, return_tensors='pt')

# train_input_ids = torch.tensor(train_encodings['input_ids'])
# train_attention_masks = torch.tensor(train_encodings['attention_mask'])
# val_input_ids = torch.tensor(val_encodings['input_ids'])
# val_attention_masks = torch.tensor(val_encodings['attention_mask'])
# test_input_ids = torch.tensor(test_encodings['input_ids'])
# test_attention_masks = torch.tensor(test_encodings['attention_mask'])

# train_labels = torch.tensor(train_df['is_sarcastic'].values)
# val_labels = torch.tensor(val_df['is_sarcastic'].values)
# test_labels = torch.tensor(test_df['is_sarcastic'].values)
     
