In [1]:
import pandas as pd
import numpy as np
import datetime
import re
import string
import contractions
import nltk
from nltk.corpus import stopwords
import torch
import transformers
from transformers import AdamW, ElectraConfig, ElectraTokenizer, ElectraForSequenceClassification, ElectraModel, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import pytorch_lightning as pl
from torch.utils.data import TensorDataset, DataLoader, Dataset
import evaluate
from torch.optim import AdamW



  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# model = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels = 2)
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
#configuration = ElectraConfig()
#model = ElectraForSequenceClassification(configuration)

In [20]:
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines = True)
df_train = df.drop(columns = ['article_link'])
col_types = {'headline':'str', 'is_sarcastic':'int32'}
df_train = df_train.astype(col_types)

In [35]:
train_size = int(0.8 * len(df_train))
val_size = int(0.1 * len(df_train))
test_size = len(df_train) - train_size - val_size

# Split the DataFrame into training and validation sets
train_df = df_train[:train_size]
val_df = df_train[train_size:train_size + val_size]
test_df = df_train[train_size + val_size:]

In [48]:
class SarcasmDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]['headline']
        labels = self.data[idx]['is_sarcastic']
        encodings = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False, return_tensors='pt')
        return encodings['input_ids'][0], encodings['attention_mask'][0], torch.tensor(labels)


In [71]:
x = "I am a spastic"

encodings = tokenizer(x, max_length=512, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=True, return_tensors='pt')

In [None]:
tokenizer = ElectraTokenizer.

In [49]:
train_dataset = SarcasmDataset(train_df.to_dict('records'), tokenizer)
val_dataset = SarcasmDataset(val_df.to_dict('records'), tokenizer)
test_dataset = SarcasmDataset(test_df.to_dict('records'), tokenizer)

In [78]:
train_dataset.data

(       is_sarcastic                                           headline
 0                 1  thirtysomething scientists unveil doomsday clo...
 1                 0  dem rep. totally nails why congress is falling...
 2                 0  eat your veggies: 9 deliciously different recipes
 3                 1  inclement weather prevents liar from getting t...
 4                 1  mother comes pretty close to using word 'strea...
 ...             ...                                                ...
 22890             0  'the wiz live!' brings the best of black excel...
 22891             1            madcap romp escalates into zany hijinks
 22892             0  some truly bizarre anti-gay arguments before t...
 22893             0  hotels think you want this bill. think again, ...
 22894             0  people show their love for the epa with thousa...
 
 [22895 rows x 2 columns],)

In [56]:

# train_encodings = tokenizer.batch_encode_plus(list(train_df['headline']), max_length=512, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False, return_tensors='pt')
# val_encodings = tokenizer.batch_encode_plus(list(val_df['headline']), max_length=512, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False, return_tensors='pt')
# test_encodings = tokenizer.batch_encode_plus(list(test_df['headline']), max_length=512, padding=True, truncation=True, return_attention_mask=True, return_token_type_ids=False, return_tensors='pt')

# train_input_ids = torch.tensor(train_encodings['input_ids'])
# train_attention_masks = torch.tensor(train_encodings['attention_mask'])
# val_input_ids = torch.tensor(val_encodings['input_ids'])
# val_attention_masks = torch.tensor(val_encodings['attention_mask'])
# test_input_ids = torch.tensor(test_encodings['input_ids'])
# test_attention_masks = torch.tensor(test_encodings['attention_mask'])

# train_labels = torch.tensor(train_df['is_sarcastic'].values)
# val_labels = torch.tensor(val_df['is_sarcastic'].values)
# test_labels = torch.tensor(test_df['is_sarcastic'].values)
     


  train_input_ids = torch.tensor(train_encodings['input_ids'])
  train_attention_masks = torch.tensor(train_encodings['attention_mask'])
  val_input_ids = torch.tensor(val_encodings['input_ids'])
  val_attention_masks = torch.tensor(val_encodings['attention_mask'])
  test_input_ids = torch.tensor(test_encodings['input_ids'])
  test_attention_masks = torch.tensor(test_encodings['attention_mask'])


In [50]:


class ElectraClassifier(pl.LightningModule):
    def __init__(self, model_name="google/electra-small-discriminator", num_labels=2, learning_rate=2e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        for param in self.model.electra.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        self.log("val_loss", loss)

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        self.log("test_loss", loss)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=0.0005)
        return optimizer

In [51]:
from pytorch_lightning import Trainer

model = ElectraClassifier()
trainer = Trainer(
    max_epochs=10
)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [56]:

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=6)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=6)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4)
     


In [57]:
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                             | Params
-----------------------------------------------------------
0 | model | ElectraForSequenceClassification | 13.5 M
-----------------------------------------------------------
66.3 K    Trainable params
13.5 M    Non-trainable params
13.5 M    Total params
54.197    Total estimated model params size (MB)


Epoch 0:  20%|██        | 292/1431 [00:24<01:37, 11.73it/s, v_num=5]
Epoch 0:  74%|███████▍  | 1058/1431 [00:48<00:17, 21.64it/s, v_num=5]      

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [82]:
#training arguments

training_args = TrainingArguments(
    output_dir='.',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='steps',
    eval_steps=500,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
#evaluation

predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [None]:
metric = evaluate.load("f1", "accuracy", "precision")
results = metric.compute(predictions=preds, references=predictions.label_ids)

In [83]:
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy", "precision", "f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions = predictions, references = labels)

In [71]:


trainer = Trainer(
    model, 
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [72]:
trainer.train()



AttributeError: 'list' object has no attribute 'keys'

In [None]:
#evaluation

predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [None]:
metric = evaluate.load("f1", "accuracy", "precision")
results = metric.compute(predictions=preds, references=predictions.label_ids)
