In [6]:
import pandas as pd
import numpy as np
import os
import time
import copy
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.optim import AdamW
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Select the compute device once for the whole notebook: CUDA when it is
# available, otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [7]:
# Load the CSV from its hard-coded path and show the first row as the cell output.
data = pd.read_csv(filepath_or_buffer='/content/final_preprocessed_data.csv')
data.head(n=1)

Unnamed: 0,headline,is_sarcastic,entity,clean_headline,sequence_length,contains_number
0,former versace store clerk sues over secret 'b...,0,,former versace store clerk sue secret black co...,10,False


In [8]:
# Stage 1: keep 70% of the rows for training and hold out 30%, stratified on
# the `is_sarcastic` label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    data['clean_headline'].values,
    data['is_sarcastic'].values,
    test_size=0.3,
    random_state=42,
    stratify=data['is_sarcastic'],
)

# Stage 2: cut the held-out 30% in half (again stratified) to obtain the
# validation and test sets.
val_texts, test_text, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

print(f'Train size: {len(train_texts)}')
print(f'Validation size: {len(val_texts)}')
print(f'Test size: {len(test_text)}')

Train size: 38728
Validation size: 8299
Test size: 8300


In [9]:
class CustomEncoderModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim=256, num_heads=8, num_layers=3,
               hidden_dim=512, num_classes=2, max_length=128, dropout=0.1):
    super(CustomEncoderModel, self).__init__()
    self.embed_dim = embedding_dim
    self.max_length = max_length

    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
    self.pos_embedding = nn.Embedding(max_length, embedding_dim)

    encoder_layer = nn.TransformerEncoderLayer(
        d_model=embedding_dim,
        nhead=num_heads,
        dim_feedforward=hidden_dim,
        dropout=dropout,
        batch_first=True
    )
    self.transformer_encoder = nn.TransformerEncoder(
        encoder_layer,
        num_layers=num_layers
    )
    self.dropout = nn.Dropout(dropout)
    self.fc1 = nn.Linear(embedding_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, num_classes)

  def forward(self, input_ids, attention_mask=None):
    batch_size, seq_len = input_ids.size()
    positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
    x = self.embedding(input_ids)
    pos_x = self.pos_embedding(positions)
    x = x + pos_x

    if attention_mask is not None:
      padding_mask = (attention_mask == 0)
    else:
      padding_mask = None

    x = self.transformer_encoder(x, src_key_padding_mask=padding_mask)

    if attention_mask is not None:
      mask_expanded = attention_mask.unsqueeze(-1).expand(x.size()).float()
      x = (x * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1).clamp(min=1e-9)
    else:
      x = x.mean(dim=1)

    x = self.dropout(x)
    x = F.relu(self.fc1(x))
    x = self.dropout(x)
    logits = self.fc2(x)

    return logits

In [10]:
class CustomTokenizer:
    """Whitespace tokenizer that maps lower-cased words to vocabulary indices.

    Args:
        vocab: Sequence of tokens; each token's position is its index.
        unk_token: Token whose index replaces out-of-vocabulary words.
        pad_token: Token whose index fills sequences up to ``max_length``.

    Raises:
        ValueError: If ``unk_token`` or ``pad_token`` is not in ``vocab``.
    """

    def __init__(self, vocab, unk_token='<UNK>', pad_token='<PAD>'):
        self.vocab = vocab
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.word2idx = {word: idx for idx, word in enumerate(vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

        # Fail early: a missing special token would otherwise surface later as
        # a None index inside torch.tensor(...).
        missing = [tok for tok in (unk_token, pad_token) if tok not in self.word2idx]
        if missing:
            raise ValueError(f'Special token(s) missing from vocab: {missing}')

    def encode(self, text, max_length=128):
        """Convert ``text`` into a fixed-length tensor of token indices.

        The text is coerced with ``str()``, lower-cased and split on
        whitespace. Unknown words map to the ``unk_token`` index; the result
        is truncated or right-padded with the ``pad_token`` index to
        ``max_length`` entries.

        Returns:
            1-D ``torch.long`` tensor of token indices.
        """
        # str() mirrors build_vocab, so non-string inputs (e.g. NaN read from
        # a CSV) are tokenised the same way they were counted.
        words = str(text).lower().split()
        unk_index = self.word2idx[self.unk_token]
        pad_index = self.word2idx[self.pad_token]

        indices = [self.word2idx.get(word, unk_index) for word in words[:max_length]]
        indices += [pad_index] * (max_length - len(indices))

        return torch.tensor(indices, dtype=torch.long)

    def get_attention_mask(self, input_ids):
        """Return a long tensor with 1 where ``input_ids`` is not the pad index, else 0."""
        pad_index = self.word2idx[self.pad_token]
        return (input_ids != pad_index).long()


def build_vocab(texts, min_freq=2, max_vocab_size=1000):
  """Build a frequency-ranked vocabulary from an iterable of texts.

  Each text is coerced with ``str()``, lower-cased and split on whitespace.
  The result starts with ``'<PAD>'`` and ``'<UNK>'`` and continues with the
  most common words, considering at most ``max_vocab_size - 2`` candidates
  and keeping only those seen at least ``min_freq`` times.

  Prints the vocabulary size and the number of distinct words, then returns
  the vocabulary as a list of strings.
  """
  word_counts = Counter(
      token
      for text in texts
      for token in str(text).lower().split()
  )

  special_tokens = ['<PAD>', '<UNK>']
  candidate_budget = max_vocab_size - len(special_tokens)
  frequent_words = [
      word
      for word, count in word_counts.most_common(candidate_budget)
      if count >= min_freq
  ]
  vocab = special_tokens + frequent_words

  print(f"Vocabulary size: {len(vocab)}")
  print(f"Total unique words: {len(word_counts)}")
  return vocab

In [11]:
class TextDataset(Dataset):
  """Map-style dataset that tokenizes one text per item on access.

  Each item is a dict with ``'input_ids'`` (from ``tokenizer.encode``),
  ``'attention_mask'`` (from ``tokenizer.get_attention_mask``) and
  ``'labels'`` (the label as a ``torch.long`` tensor).
  """

  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.texts, self.labels = texts, labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of examples, taken from ``texts``."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Tokenize example ``idx`` and return it as a dict of tensors."""
    raw_text, target = str(self.texts[idx]), self.labels[idx]

    # The tokenizer both truncates and pads to max_length.
    token_ids = self.tokenizer.encode(raw_text, max_length=self.max_length)
    mask = self.tokenizer.get_attention_mask(token_ids)

    sample = dict(input_ids=token_ids, attention_mask=mask)
    sample['labels'] = torch.tensor(target, dtype=torch.long)
    return sample

In [12]:
def train_custom_encoder(train_texts, train_labels, val_texts, val_labels, save_path,
                   epochs=3, batch_size=16, learning_rate=2e-4, max_length=128):
  """Train a CustomEncoderModel from scratch and save the best checkpoint.

  A vocabulary is built from ``train_texts``, the model is trained with AdamW
  and cross-entropy, and after every epoch the validation loss drives both a
  ``ReduceLROnPlateau`` scheduler and best-weights tracking. The weights with
  the lowest validation loss are restored and written to ``save_path``
  together with the training history.

  Args:
    train_texts, train_labels: Training examples and their integer labels.
    val_texts, val_labels: Validation examples and their integer labels.
    save_path: Destination file for the checkpoint.
    epochs: Number of passes over the training set.
    batch_size: Batch size for both loaders.
    learning_rate: Initial AdamW learning rate.
    max_length: Token length every example is padded/truncated to.

  Returns:
    ``(model, tokenizer, train_losses, val_accuracies, total_training_time)``;
    ``model`` holds the best weights and is in eval mode.

  Raises:
    ValueError: If the training or validation set is empty.
  """
  if len(train_texts) == 0 or len(val_texts) == 0:
    raise ValueError('train_texts and val_texts must both be non-empty')

  print(f"\n{'-'*60}")
  print("Training Custom Encoder Model")
  print(f"{'-'*60}")

  model_name = 'Custom Model'

  # Create the checkpoint directory up front so the final save cannot fail
  # on a missing folder after all the training work is done.
  parent = os.path.dirname(save_path)
  if parent:
    os.makedirs(parent, exist_ok=True)

  print('Building vocabulary...')
  vocab = build_vocab(train_texts, min_freq=2, max_vocab_size=10000)
  tokenizer = CustomTokenizer(vocab)

  model = CustomEncoderModel(vocab_size=len(vocab),
                             embedding_dim=256,
                             num_heads=8,
                             num_layers=3,
                             hidden_dim=512,
                             num_classes=2,
                             max_length=max_length,
                             dropout=0.1
  )

  # Report the model size.
  total_params = sum(p.numel() for p in model.parameters())
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
  print(f'Total parameters: {total_params:,}')
  print(f'Trainable parameters: {trainable_params:,}')

  model.to(device)

  train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length)
  val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length)

  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  criterion = nn.CrossEntropyLoss()
  optimizer = AdamW(model.parameters(), lr=learning_rate)
  # Stepped once per epoch with the validation loss (see end of epoch loop).
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
      optimizer, mode='min', factor=0.5, patience=2
  )

  best_model_wts = copy.deepcopy(model.state_dict())
  best_loss = float('inf')

  # Per-epoch history. These lists live outside the epoch loop so the
  # checkpoint records every epoch, not just the last one.
  train_losses = []
  val_losses = []
  val_accuracies = []
  training_times = []

  for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    train_total_loss = 0.0
    epoch_start = time.time()

    for batch in train_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      optimizer.zero_grad()
      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
      )
      loss = criterion(outputs, labels)
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step()

      train_total_loss += loss.item()

    # Only the training pass is timed; validation is excluded.
    epoch_time = time.time() - epoch_start
    training_times.append(epoch_time)

    avg_loss = train_total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # ---- Validation pass ----
    model.eval()
    val_predictions = []
    val_true_labels = []
    val_total_loss = 0.0

    with torch.no_grad():
      for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The model returns raw logits of shape (batch, num_classes).
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        val_total_loss += criterion(outputs, labels).item()
        predictions = torch.argmax(outputs, dim=1)
        val_predictions.extend(predictions.cpu().numpy().tolist())
        val_true_labels.extend(labels.cpu().numpy().tolist())

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    val_accuracies.append(val_accuracy)
    avg_val_loss = val_total_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    scheduler.step(avg_val_loss)

    # Keep an in-memory copy of the best weights; the file itself is written
    # once, after the last epoch.
    if avg_val_loss < best_loss:
      best_loss = avg_val_loss
      best_model_wts = copy.deepcopy(model.state_dict())
      print('Updating best model weights')

    print(f'Epoch {epoch+1}/{epochs} - loss: {avg_loss:.4f}, Val_loss: {avg_val_loss:.4f}, Val_accuracy: {val_accuracies[-1]:.4f}, time: {epoch_time:.2f}s')

  total_training_time = sum(training_times)
  print(f'Total trining time: {total_training_time:.2f}s')

  model.load_state_dict(best_model_wts)
  model.to(device)
  model.eval()

  # NOTE(review): the checkpoint pickles the tokenizer instance, so loading it
  # presumably needs CustomTokenizer importable and, on recent PyTorch,
  # torch.load(..., weights_only=False) -- confirm. "vocab" alone is enough to
  # rebuild it via CustomTokenizer(vocab).
  checkpoint = {
    "model_state_dict": best_model_wts,
    "model_name": model_name,
    "train_losses": train_losses,
    "val_accuracies": val_accuracies,
    "val_losses": val_losses,
    "total_training_time": total_training_time,
    "vocab": vocab,
    "tokenizer": tokenizer
  }

  torch.save(checkpoint, save_path)
  print(f"Checkpoint saved to: {save_path}")

  return model, tokenizer, train_losses, val_accuracies, total_training_time

In [13]:
# Run the custom-encoder training for 50 epochs and keep the returned
# artifacts under `custom_*` names.
# NOTE(review): the log recorded beneath this cell shows the training loss
# flat at ~0.69 and validation accuracy stuck at 0.5416 from epoch 2 onward,
# i.e. the model is not learning with these settings. learning_rate=1e-3 is
# 5x the function default (2e-4) -- consider retrying with a lower value.
custom_save_path = 'custom_encoder_model.pth'
(
    custom_model,
    custom_tokenizer,
    custom_train_losses,
    custom_val_accuracies,
    custom_training_time,
) = train_custom_encoder(
    train_texts,
    train_labels,
    val_texts,
    val_labels,
    save_path=custom_save_path,
    epochs=50,
    batch_size=16,
    learning_rate=1e-3,
)


------------------------------------------------------------
Training Custom Encoder Model
------------------------------------------------------------
Building vocabulary...
Vocabulary size: 10000
Total unique words: 20351
Total parameters: 4,306,690
Trainable parameters: 4,306,690


  output = torch._nested_tensor_from_mask(


Updating best model weights
Epoch 1/50 - loss: 0.6845, Val_loss: 0.6897, Val_accuracy: 0.5510, time: 34.52s
Epoch 2/50 - loss: 0.6895, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 34.01s
Epoch 3/50 - loss: 0.6898, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 34.22s
Epoch 4/50 - loss: 0.6899, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 34.58s
Updating best model weights
Epoch 5/50 - loss: 0.6897, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 35.00s
Epoch 6/50 - loss: 0.6897, Val_loss: 0.6906, Val_accuracy: 0.5416, time: 35.54s
Updating best model weights
Epoch 7/50 - loss: 0.6899, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 35.23s
Epoch 8/50 - loss: 0.6897, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 35.39s
Epoch 9/50 - loss: 0.6898, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 35.31s
Epoch 10/50 - loss: 0.6897, Val_loss: 0.6898, Val_accuracy: 0.5416, time: 35.28s
Epoch 11/50 - loss: 0.6897, Val_loss: 0.6897, Val_accuracy: 0.5416, time: 35.36s
Epoch 12/50 - loss: 0.6897, Val_lo