In [30]:
import pandas as pd

# Relative path of the combined sarcasm dataset
file_path = "../datasets/combined.parquet"

# Read the parquet file into a DataFrame
data = pd.read_parquet(path=file_path)

# Preview the leading rows as a quick sanity check of the columns
data.head(n=5)

Unnamed: 0,sentence,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1.0
1,dem rep. totally nails why congress is falling...,0.0
2,eat your veggies: 9 deliciously different recipes,0.0
3,inclement weather prevents liar from getting t...,1.0
4,mother comes pretty close to using word 'strea...,1.0


In [31]:
# Number of missing values in each column
null_check = data.isna().sum()

# Relative frequency of each class in the 'is_sarcastic' column
label_distribution = data.loc[:, "is_sarcastic"].value_counts(normalize=True)

# Show both results together as the cell output
(null_check, label_distribution)

(sentence        0
 is_sarcastic    0
 dtype: int64,
 is_sarcastic
 0.0    0.521391
 1.0    0.478609
 Name: proportion, dtype: float64)

In [32]:
import re
from sklearn.model_selection import train_test_split

# Data cleaning: collapse every run of newline / carriage-return / tab
# characters in a sentence into a single space (no other characters are touched)
_control_whitespace_run = re.compile(r"[\n\r\t]+")
data["sentence"] = data["sentence"].apply(
    lambda text: _control_whitespace_run.sub(" ", text)
)

# Hold out 30% of the rows, then halve the held-out part into validation and
# test sets; both splits use the same fixed random_state
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

# Row counts of the three splits, shown as the cell output
train_size = len(train_data)
val_size = len(val_data)
test_size = len(test_data)
(train_size, val_size, test_size)

(28322, 6069, 6070)

In [33]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# Initialize the BERT tokenizer from the pretrained "bert-base-uncased" name;
# the same name is passed to the model's from_pretrained call later on, so the
# two must be changed together.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


class SarcasticSentencesDataset(Dataset):
    """
    A custom PyTorch Dataset for the sarcastic sentences dataset.

    Sentences are tokenized lazily, one at a time, when an item is requested.

    Args:
        sentences: Indexable collection of raw sentences. Items are read by
            integer position (``sentences[i]``) and converted with ``str``.
        labels: Indexable collection of class labels aligned with
            ``sentences``; each label is converted to a ``torch.long`` tensor.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping containing ``input_ids``
            and ``attention_mask`` tensors.
        max_len: Length every encoded sentence is padded or truncated to.
    """

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, item):
        """
        Tokenize the sentence at position ``item``.

        Returns:
            dict with keys ``sentence`` (the raw string), ``input_ids`` and
            ``attention_mask`` (flattened tensors) and ``labels`` (a
            ``torch.long`` scalar tensor).
        """
        sentence = str(self.sentences[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            sentence,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",  # Return PyTorch tensors
            truncation=True,
        )

        return {
            "sentence": sentence,
            # Flatten so each item is 1-D and the DataLoader can stack items
            # into a (batch, length) tensor.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }


# Constants
MAX_LEN = 128  # Length every tokenized sentence is padded / truncated to
BATCH_SIZE = 16


def _make_split_dataset(split_frame):
    """Wrap one split's 'sentence' and 'is_sarcastic' columns in a dataset."""
    return SarcasticSentencesDataset(
        sentences=split_frame["sentence"].to_numpy(),
        labels=split_frame["is_sarcastic"].to_numpy(),
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )


# One dataset per split
train_dataset = _make_split_dataset(train_data)
val_dataset = _make_split_dataset(val_data)
test_dataset = _make_split_dataset(test_data)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Pull a single batch from the training loader to inspect its structure
next(iter(train_loader))

{'sentence': ['girlfriend to stay underneath blanket for next 5 months',
  'is putting a plastic container in the microwave really that bad?',
  "jennifer lawrence may hate singing, but now she's a pop star",
  "kim cattrall's missing brother found dead at his home",
  'advisors hopeful jeb bush finally has momentum to end campaign',
  'Did you slip in it AGAIN? Were you led astray by a passage from Heller you thought you could use to legitimize gun control, your wet dream? I submit from many past rulings of the courts that little statement is in error.',
  'goodyear unveils new, circular tires',
  'Ah! So wellspoken for an addle-pated twiddle poop! I congratulate you! And a total lie! Who the Hades wants 13-year-old children to be sexually active? No responsible, thinking person could wish such a thing. You malign Planned Parenthood. You have ceased to be a credible commentator on this issue.',
  'senate bill may answer a decades-old request',
  "so you are into pommy bashing now? mae

In [34]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)
from torch.nn import CrossEntropyLoss
from tqdm import tqdm  # for displaying progress
import numpy as np

In [35]:
# Pick the compute device: CUDA when available, otherwise the MPS backend,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"

device = torch.device(_device_name)

In [36]:
from typing import Dict
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.nn import Module, CrossEntropyLoss
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from tqdm import tqdm


def train_epoch(
    model: Module,
    data_loader: DataLoader,
    optimizer: Optimizer,
    device: torch.device,
    scheduler: _LRScheduler,
    loss_fn: CrossEntropyLoss,
    n_examples: int,
) -> Dict[str, float]:
    """Train ``model`` for one pass over ``data_loader`` and return epoch metrics.

    For every batch the gradients are zeroed, the loss is back-propagated and
    both the optimizer and the scheduler are stepped (the scheduler therefore
    advances once per batch, not once per epoch).

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer updating the model's parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped after every optimizer step.
        loss_fn: Loss called as ``loss_fn(logits, labels)``.
        n_examples: Denominator used for the accuracy.

    Returns:
        Dict of plain Python floats with keys ``accuracy``,
        ``precision_sarcasm``, ``recall_sarcasm``, ``precision_non_sarcasm``,
        ``recall_non_sarcasm`` and ``loss`` (mean of the per-batch losses).
        With no batches the loss is ``nan``; with ``n_examples == 0`` the
        accuracy is ``0.0``.
    """
    model.train()
    losses = []
    correct_predictions = 0

    # Confusion-matrix counters; class 1 is treated as "sarcasm"
    tp_sarcasm = 0
    tn_non_sarcasm = 0
    fp_sarcasm = 0
    fn_sarcasm = 0

    for batch in tqdm(data_loader, total=len(data_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()

        preds = torch.argmax(logits, dim=1)

        # Accumulate as a Python int (like the counters below) so the returned
        # accuracy is a float rather than a device tensor.
        correct_predictions += int(torch.sum(preds == labels).item())

        # Update TP, TN, FP, FN counters
        tp_sarcasm += torch.sum((preds == 1) & (labels == 1)).item()
        tn_non_sarcasm += torch.sum((preds == 0) & (labels == 0)).item()
        fp_sarcasm += torch.sum((preds == 1) & (labels == 0)).item()
        fn_sarcasm += torch.sum((preds == 0) & (labels == 1)).item()

    # The 1e-10 term keeps the ratios defined when a denominator is zero
    # Compute precision and recall for sarcasm
    precision_sarcasm = tp_sarcasm / (tp_sarcasm + fp_sarcasm + 1e-10)
    recall_sarcasm = tp_sarcasm / (tp_sarcasm + fn_sarcasm + 1e-10)

    # Compute precision and recall for non-sarcasm
    precision_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fn_sarcasm + 1e-10)
    recall_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fp_sarcasm + 1e-10)

    # Guard the degenerate cases instead of raising or emitting a numpy warning
    accuracy = correct_predictions / n_examples if n_examples else 0.0
    mean_loss = float(np.mean(losses)) if losses else float("nan")

    return {
        "accuracy": accuracy,
        "precision_sarcasm": precision_sarcasm,
        "recall_sarcasm": recall_sarcasm,
        "precision_non_sarcasm": precision_non_sarcasm,
        "recall_non_sarcasm": recall_non_sarcasm,
        "loss": mean_loss,
    }

In [37]:
def eval_model(
    model: Module,
    data_loader: DataLoader,
    device: torch.device,
    loss_fn: CrossEntropyLoss,
    n_examples: int,
) -> Dict[str, float]:
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    The model is put in eval mode and every batch is processed under
    ``torch.no_grad()``.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to.
        loss_fn: Loss called as ``loss_fn(logits, labels)``.
        n_examples: Denominator used for the accuracy.

    Returns:
        Dict of plain Python floats with keys ``accuracy``,
        ``precision_sarcasm``, ``recall_sarcasm``, ``precision_non_sarcasm``,
        ``recall_non_sarcasm`` and ``loss`` (mean of the per-batch losses).
        With no batches the loss is ``nan``; with ``n_examples == 0`` the
        accuracy is ``0.0``.
    """
    model.eval()
    losses = []
    correct_predictions = 0

    # Confusion-matrix counters; class 1 is treated as "sarcasm"
    tp_sarcasm = 0
    tn_non_sarcasm = 0
    fp_sarcasm = 0
    fn_sarcasm = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            losses.append(loss.item())

            preds = torch.argmax(logits, dim=1)

            # Accumulate as a Python int (like the counters below) so the
            # returned accuracy is a float rather than a device tensor.
            correct_predictions += int(torch.sum(preds == labels).item())

            # Update TP, TN, FP, FN counters
            tp_sarcasm += torch.sum((preds == 1) & (labels == 1)).item()
            tn_non_sarcasm += torch.sum((preds == 0) & (labels == 0)).item()
            fp_sarcasm += torch.sum((preds == 1) & (labels == 0)).item()
            fn_sarcasm += torch.sum((preds == 0) & (labels == 1)).item()

    # The 1e-10 term keeps the ratios defined when a denominator is zero
    # Compute precision and recall for sarcasm
    precision_sarcasm = tp_sarcasm / (tp_sarcasm + fp_sarcasm + 1e-10)
    recall_sarcasm = tp_sarcasm / (tp_sarcasm + fn_sarcasm + 1e-10)

    # Compute precision and recall for non-sarcasm
    precision_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fn_sarcasm + 1e-10)
    recall_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fp_sarcasm + 1e-10)

    # Guard the degenerate cases instead of raising or emitting a numpy warning
    accuracy = correct_predictions / n_examples if n_examples else 0.0
    mean_loss = float(np.mean(losses)) if losses else float("nan")

    return {
        "accuracy": accuracy,
        "precision_sarcasm": precision_sarcasm,
        "recall_sarcasm": recall_sarcasm,
        "precision_non_sarcasm": precision_non_sarcasm,
        "recall_non_sarcasm": recall_non_sarcasm,
        "loss": mean_loss,
    }

In [38]:
# Seed torch's global RNG before anything random happens in this cell, so the
# newly initialized classifier weights, dropout and batch shuffling repeat
# across runs (as far as the backend allows). Same value as the split's
# random_state.
SEED = 42
torch.manual_seed(SEED)

# Load pre-trained model with a two-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)  # Send the model to GPU if available

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 3

# Total number of training steps (one scheduler step per batch)
total_steps = len(train_loader) * num_epochs

# Scheduler for learning rate, with no warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Loss function
loss_fn = CrossEntropyLoss()


def _format_metrics(metrics):
    """Render one metrics dict as a list of 'Name: value' strings (4 decimals)."""
    titled_keys = (
        ("Loss", "loss"),
        ("Accuracy", "accuracy"),
        ("Sarcasm Precision", "precision_sarcasm"),
        ("Sarcasm Recall", "recall_sarcasm"),
        ("Non-Sarcasm Precision", "precision_non_sarcasm"),
        ("Non-Sarcasm Recall", "recall_non_sarcasm"),
    )
    return [f"{title}: {metrics[key]:.4f}" for title, key in titled_keys]


# Training and Validation
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print("-" * 10)

    # Training phase
    train_output = train_epoch(
        model, train_loader, optimizer, device, scheduler, loss_fn, len(train_dataset)
    )

    print("Training Metrics:")
    train_metrics = _format_metrics(train_output)
    print(" | ".join(train_metrics))

    # Validation phase
    val_output = eval_model(model, val_loader, device, loss_fn, len(val_dataset))

    print("Validation Metrics:")
    val_metrics = _format_metrics(val_output)
    print(" | ".join(val_metrics))

    print()

# Save the model
# NOTE(review): this stores the weights after the final epoch. In the recorded
# run the validation loss was lowest after epoch 2 (0.3026) and rose to 0.3940
# after epoch 3, so consider checkpointing on the best validation loss.
torch.save(model.state_dict(), "sarcastic_model.pth")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------


100%|██████████| 1771/1771 [09:38<00:00,  3.06it/s]


Training Metrics:
Loss: 0.3494 | Accuracy: 0.8383 | Sarcasm Precision: 0.8309 | Sarcasm Recall: 0.8312 | Non-Sarcasm Precision: 0.8450 | Non-Sarcasm Recall: 0.8448


100%|██████████| 380/380 [00:40<00:00,  9.45it/s]


Validation Metrics:
Loss: 0.3157 | Accuracy: 0.8623 | Sarcasm Precision: 0.8939 | Sarcasm Recall: 0.8109 | Non-Sarcasm Precision: 0.8376 | Non-Sarcasm Recall: 0.9102

Epoch 2/3
----------


100%|██████████| 1771/1771 [10:57<00:00,  2.69it/s]


Training Metrics:
Loss: 0.1788 | Accuracy: 0.9248 | Sarcasm Precision: 0.9195 | Sarcasm Recall: 0.9237 | Non-Sarcasm Precision: 0.9297 | Non-Sarcasm Recall: 0.9258


100%|██████████| 380/380 [00:42<00:00,  9.04it/s]


Validation Metrics:
Loss: 0.3026 | Accuracy: 0.8845 | Sarcasm Precision: 0.8699 | Sarcasm Recall: 0.8945 | Non-Sarcasm Precision: 0.8989 | Non-Sarcasm Recall: 0.8751

Epoch 3/3
----------


100%|██████████| 1771/1771 [11:44<00:00,  2.51it/s]


Training Metrics:
Loss: 0.0818 | Accuracy: 0.9683 | Sarcasm Precision: 0.9643 | Sarcasm Recall: 0.9697 | Non-Sarcasm Precision: 0.9720 | Non-Sarcasm Recall: 0.9670


100%|██████████| 380/380 [00:41<00:00,  9.20it/s]


Validation Metrics:
Loss: 0.3940 | Accuracy: 0.8786 | Sarcasm Precision: 0.8963 | Sarcasm Recall: 0.8464 | Non-Sarcasm Precision: 0.8637 | Non-Sarcasm Recall: 0.9086

