# Sarcasm detection with BERT


## Fine tunning on a combination of datasets


### Importing the dataset


In [1]:
import pandas as pd

In [2]:
# Load the dataset
combined_df_file_path = "../datasets/combined.parquet"
combined_df = pd.read_parquet(combined_df_file_path)

# Display the first few rows of the dataset for a quick overview
combined_df.head()

Unnamed: 0,sentence,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1.0
1,dem rep. totally nails why congress is falling...,0.0
2,eat your veggies: 9 deliciously different recipes,0.0
3,inclement weather prevents liar from getting t...,1.0
4,mother comes pretty close to using word 'strea...,1.0


### Some statistics and cleaning


In [3]:
import re

In [4]:
# Checking for any null values in the dataset
combined_df_null_check = combined_df.isnull().sum()

# Data cleaning: removing special characters and escape sequences from the sentences
combined_df["sentence"] = combined_df["sentence"].apply(
    lambda x: re.sub(r"[\n\r\t]+", " ", x)
)

# Checking the distribution of the 'is_sarcastic' column
combined_df_label_distribution = combined_df["is_sarcastic"].value_counts(
    normalize=True
)

combined_df_null_check, combined_df_label_distribution

(sentence        0
 is_sarcastic    0
 dtype: int64,
 is_sarcastic
 0.0    0.521391
 1.0    0.478609
 Name: proportion, dtype: float64)

### Splitting the dataset


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Splitting the dataset into training, validation, and testing sets
combined_train_data, combined_test_data = train_test_split(
    combined_df, test_size=0.3, random_state=42
)
combined_val_data, combined_test_data = train_test_split(
    combined_test_data, test_size=0.5, random_state=42
)

# Showing the size of each split
combined_train_size, combined_val_size, combined_test_size = (
    len(combined_train_data),
    len(combined_val_data),
    len(combined_test_data),
)
combined_train_size, combined_val_size, combined_test_size

(28322, 6069, 6070)

### Creating the Dataset class for the BertTokenizer & PyTorch


In [7]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
class SarcasticSentencesDataset(Dataset):
    """
    A custom PyTorch Dataset for the sarcastic sentences dataset.
    """

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        label = self.labels[item]

        # Encoding the sentences using the tokenizer
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",  # Return PyTorch tensors
            truncation=True,
        )

        return {
            "sentence": sentence,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [9]:
# Initialize the BERT tokenizer
bert_base_uncased_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Constants
COMBINED_MAX_LEN = 128  # Maximum length of the tokens list
COMBINED_BATCH_SIZE = 16

# Creating instances of the SarcasticSentencesDataset
combined_train_dataset = SarcasticSentencesDataset(
    combined_train_data["sentence"].to_numpy(),
    combined_train_data["is_sarcastic"].to_numpy(),
    bert_base_uncased_tokenizer,
    COMBINED_MAX_LEN,
)

combined_val_dataset = SarcasticSentencesDataset(
    combined_val_data["sentence"].to_numpy(),
    combined_val_data["is_sarcastic"].to_numpy(),
    bert_base_uncased_tokenizer,
    COMBINED_MAX_LEN,
)

combined_test_dataset = SarcasticSentencesDataset(
    combined_test_data["sentence"].to_numpy(),
    combined_test_data["is_sarcastic"].to_numpy(),
    bert_base_uncased_tokenizer,
    COMBINED_MAX_LEN,
)

# Creating the DataLoaders for training, validation, and testing
combined_train_loader = DataLoader(
    combined_train_dataset, batch_size=COMBINED_BATCH_SIZE, shuffle=True
)
combined_val_loader = DataLoader(combined_val_dataset, batch_size=COMBINED_BATCH_SIZE)
combined_test_loader = DataLoader(combined_test_dataset, batch_size=COMBINED_BATCH_SIZE)

# Checking the first batch from the train_loader
next(iter(combined_train_loader))

{'sentence': ["The meticulously preserved, unchanging 66 Books of the Bible aren't compelling?",
  "And I'm sure you of all people know the mind of God... Please, you just take snippets of scripture, and twist it into well this is absolute. God loves gays and lesbians, but you fling god out as a way to justify your slander and bigotry.  ",
  'exclusive: bet responds after coming under fire from journalists and publicists',
  "saving the world's last 3 northern white rhino",
  'allies: islamist motive for killing nemtsov is nonsense',
  'laid-off zoologist goes on tranquilizing rampage',
  'annoying man more annoying after skydiving',
  'paris terror harms france, islam, and the world',
  "former patriots and chiefs tackle ryan o'callaghan comes out as gay",
  '"Placeholder" would honestly be a better name than Pied Piper.',
  "snapchat's snapcash: is peer-to-peer payment safe?",
  'grown man purchases 37th sailor moon figurine',
  'why we should tip service workers generously',
  'emot

### Creating the train and validation loops


In [10]:
# Torch imports
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss, Module
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler, LambdaLR

# Transformers imports
from transformers import (
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)

# Typing imports
from typing import Dict, Optional, List, Union

# Other libraries
from tqdm import tqdm
import numpy as np

In [11]:
device: torch.device = torch.device(
    device="cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [12]:
def train_epoch(
    model: Module,
    data_loader: DataLoader,
    optimizer: Optimizer,
    device: torch.device,
    scheduler: Union[_LRScheduler, LambdaLR],
    loss_fn: CrossEntropyLoss,
    n_examples: int,
    feature_keys: Optional[List[str]] = None,  # List of keys if present
) -> Dict[str, float]:
    model.train()

    losses = []
    correct_predictions = torch.Tensor([0]).to(device)

    # For calculating precision and recall
    tp_sarcasm = 0
    tn_non_sarcasm = 0
    fp_sarcasm = 0
    fn_sarcasm = 0

    for batch in tqdm(data_loader, total=len(data_loader)):
        # Process inputs for single/multiple features
        inputs = (
            {key: batch[key].to(device) for key in feature_keys}
            if feature_keys
            else {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
            }
        )
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(**inputs)
        logits = outputs.logits

        loss = loss_fn(logits, labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)

        # Update TP, TN, FP, FN counters
        tp_sarcasm += (preds & labels).sum().item()
        tn_non_sarcasm += ((~preds.byte()) & (~labels.byte())).sum().item()
        fp_sarcasm += (preds & (~labels.byte())).sum().item()
        fn_sarcasm += ((~preds.byte()) & labels).sum().item()

    # Calculate precision and recall for sarcasm class
    precision_sarcasm = tp_sarcasm / (tp_sarcasm + fp_sarcasm)
    recall_sarcasm = tp_sarcasm / (tp_sarcasm + fn_sarcasm)

    # Calculate precision and recall for non-sarcasm class
    precision_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fn_sarcasm)
    recall_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fp_sarcasm)

    return {
        "accuracy": correct_predictions.float().item() / n_examples,
        "precision_sarcasm": precision_sarcasm,
        "recall_sarcasm": recall_sarcasm,
        "precision_non_sarcasm": precision_non_sarcasm,
        "recall_non_sarcasm": recall_non_sarcasm,
        "loss": np.mean(losses),
    }

In [13]:
def eval_model(
    model: Module,
    data_loader: DataLoader,
    device: torch.device,
    loss_fn: CrossEntropyLoss,
    n_examples: int,
    feature_keys: Optional[List[str]] = None,
) -> Dict[str, float]:
    model.eval()

    losses = []
    correct_predictions = torch.Tensor([0]).to(device)

    # Initialize counters for precision and recall
    tp_sarcasm = 0
    tn_non_sarcasm = 0
    fp_sarcasm = 0
    fn_sarcasm = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            # Process inputs for single/multiple features
            inputs = (
                {key: batch[key].to(device) for key in feature_keys}
                if feature_keys
                else {
                    "input_ids": batch["input_ids"].to(device),
                    "attention_mask": batch["attention_mask"].to(device),
                }
            )
            labels = batch["labels"].to(device)

            outputs = model(**inputs)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            losses.append(loss.item())

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

            # Update TP, TN, FP, FN counters for precision and recall calculations
            tp_sarcasm += (preds & labels).sum().item()
            tn_non_sarcasm += ((~preds.byte()) & (~labels.byte())).sum().item()
            fp_sarcasm += (preds & (~labels.byte())).sum().item()
            fn_sarcasm += ((~preds.byte()) & labels).sum().item()

    # Calculate precision and recall for sarcasm class
    precision_sarcasm = tp_sarcasm / (tp_sarcasm + fp_sarcasm)
    recall_sarcasm = tp_sarcasm / (tp_sarcasm + fn_sarcasm)

    # Calculate precision and recall for non-sarcasm class
    precision_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fn_sarcasm)
    recall_non_sarcasm = tn_non_sarcasm / (tn_non_sarcasm + fp_sarcasm)

    return {
        "accuracy": correct_predictions.float().item() / n_examples,
        "precision_sarcasm": precision_sarcasm,
        "recall_sarcasm": recall_sarcasm,
        "precision_non_sarcasm": precision_non_sarcasm,
        "recall_non_sarcasm": recall_non_sarcasm,
        "loss": np.mean(losses),
    }

### Training & evaluation of the model


In [15]:
# Hyperparameters

COMBINED_PRETRAINED_MODEL_NAME_OR_PATH = "bert-base-uncased"
COMBINED_NUM_LABELS = 2  # Number of labels in the dataset
COMBINED_HIDDEN_DROPOUT_PROB = 0.3  # Dropout rate
COMBINED_ATTENTION_PROBS_DROPOUT_PROB = 0.3  # Dropout rate in attention heads
COMBINED_NUM_EPOCHS = 3  # Number of epochs
COMBINED_LR = 2e-5  # Learning rate
COMBINED_WEIGHT_DECAY = 0.01  # Weight decay for regularization
COMBINED_NUM_WARMUP_STEPS = 0  # Number of warmup steps for learning rate scheduler

# Load pre-trained model
combined_model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=COMBINED_PRETRAINED_MODEL_NAME_OR_PATH,
    num_labels=COMBINED_NUM_LABELS,
    hidden_dropout_prob=COMBINED_HIDDEN_DROPOUT_PROB,  # dropout rate,
    attention_probs_dropout_prob=COMBINED_ATTENTION_PROBS_DROPOUT_PROB,  # dropout rate in attention heads
)

# For typing purposes, check if model is an instance of Module
if not isinstance(combined_model, Module):
    raise ValueError("Model must be an instance of Module")

# Send the model to GPU if available
combined_model.to(device=device)  # type: ignore

# Optimizer
combined_optimizer = AdamW(
    combined_model.parameters(), lr=COMBINED_LR, weight_decay=COMBINED_WEIGHT_DECAY
)

# Total number of training steps
combined_total_steps = len(combined_train_loader) * COMBINED_NUM_EPOCHS

# Scheduler for learning rate
combined_scheduler = get_linear_schedule_with_warmup(
    combined_optimizer,
    num_warmup_steps=COMBINED_NUM_WARMUP_STEPS,
    num_training_steps=combined_total_steps,
)

# Loss function
combined_loss_fn = CrossEntropyLoss()

# Feature keys
combined_feature_keys = ["input_ids", "attention_mask"]

# Training and Validation
for epoch in range(COMBINED_NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{COMBINED_NUM_EPOCHS}")
    print("-" * 10)

    # Training phase
    train_output = train_epoch(
        model=combined_model,
        data_loader=combined_train_loader,
        optimizer=combined_optimizer,
        device=device,
        scheduler=combined_scheduler,
        loss_fn=combined_loss_fn,
        n_examples=len(combined_train_dataset),
        feature_keys=combined_feature_keys,
    )

    print(f"Training Metrics:")
    train_metrics = [
        f"Loss: {train_output['loss']:.4f}",
        f"Accuracy: {train_output['accuracy']:.4f}",
        f"Sarcasm Precision: {train_output['precision_sarcasm']:.4f}",
        f"Sarcasm Recall: {train_output['recall_sarcasm']:.4f}",
        f"Non-Sarcasm Precision: {train_output['precision_non_sarcasm']:.4f}",
        f"Non-Sarcasm Recall: {train_output['recall_non_sarcasm']:.4f}",
    ]
    print(" | ".join(train_metrics))

    # Validation phase
    val_output = eval_model(
        model=combined_model,
        data_loader=combined_val_loader,
        device=device,
        loss_fn=combined_loss_fn,
        n_examples=len(combined_val_dataset),
        feature_keys=combined_feature_keys,
    )

    print(f"Validation Metrics:")
    val_metrics = [
        f"Loss: {val_output['loss']:.4f}",
        f"Accuracy: {val_output['accuracy']:.4f}",
        f"Sarcasm Precision: {val_output['precision_sarcasm']:.4f}",
        f"Sarcasm Recall: {val_output['recall_sarcasm']:.4f}",
        f"Non-Sarcasm Precision: {val_output['precision_non_sarcasm']:.4f}",
        f"Non-Sarcasm Recall: {val_output['recall_non_sarcasm']:.4f}",
    ]
    print(" | ".join(val_metrics))

    print()

# Save the model
PRETRAINED_BERT_PATH = "sarcastic_model.pth"
torch.save(combined_model.state_dict(), PRETRAINED_BERT_PATH)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------


100%|██████████| 1771/1771 [09:58<00:00,  2.96it/s]


Training Metrics:
Loss: 0.4146 | Accuracy: 0.8031 | Sarcasm Precision: 0.7991 | Sarcasm Recall: 0.7860 | Non-Sarcasm Precision: 0.9996 | Non-Sarcasm Recall: 0.9996


100%|██████████| 380/380 [00:40<00:00,  9.30it/s]


Validation Metrics:
Loss: 0.3539 | Accuracy: 0.8423 | Sarcasm Precision: 0.8892 | Sarcasm Recall: 0.7693 | Non-Sarcasm Precision: 0.9996 | Non-Sarcasm Recall: 0.9998

Epoch 2/3
----------


 64%|██████▎   | 1125/1771 [07:08<04:08,  2.60it/s]

## Create a new model based on the pre-trained BERT model, adding review features


### Create the dataset class for the new model


In [None]:
class SarcasticProductReviewDataset(Dataset):
    """
    A PyTorch Dataset class for sarcastic product reviews with multiple text features.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review_data = self.data.iloc[idx]
        label = review_data["is_sarcastic"]

        # Tokenizing each text feature separately
        title_encoding = self.tokenize_text_feature(review_data["title"])
        author_encoding = self.tokenize_text_feature(review_data["author"])
        product_encoding = self.tokenize_text_feature(review_data["product"])
        review_encoding = self.tokenize_text_feature(review_data["review"])

        # Convert stars rating to a tensor
        stars_rating = torch.tensor([float(review_data["stars"])], dtype=torch.float)

        return {
            "title_input_ids": title_encoding["input_ids"].flatten(),
            "title_attention_mask": title_encoding["attention_mask"].flatten(),
            "author_input_ids": author_encoding["input_ids"].flatten(),
            "author_attention_mask": author_encoding["attention_mask"].flatten(),
            "product_input_ids": product_encoding["input_ids"].flatten(),
            "product_attention_mask": product_encoding["attention_mask"].flatten(),
            "review_input_ids": review_encoding["input_ids"].flatten(),
            "review_attention_mask": review_encoding["attention_mask"].flatten(),
            "stars": stars_rating.flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

    def tokenize_text_feature(self, text):
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,  # truncate or pad to max_len
            return_token_type_ids=False,
            padding="max_length",  # pad to max_length
            return_attention_mask=True,
            return_tensors="pt",  # return tensors for PyTorch
            truncation=True,
        )

### Create the model class


In [None]:
import torch
from transformers import BertModel
import torch.nn as nn


class ExtendedBertForMultiFeatureClassification(nn.Module):
    def __init__(
        self,
        pretrained_bert_path,
        hidden_size,
        num_labels,
        hidden_dropout_prob,
        attention_probs_dropout_prob,
        star_hidden_dropout_prob,
    ):
        super().__init__()

        self.bert = BertModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_bert_path,
            num_labels=num_labels,
            hidden_dropout_prob=hidden_dropout_prob,  # dropout rate,
            attention_probs_dropout_prob=attention_probs_dropout_prob,  # dropout rate in attention heads
        )
        if not isinstance(self.bert, Module):
            raise ValueError("Model must be an instance of Module")

        self.bert.load_state_dict(
            torch.load(pretrained_bert_path, map_location=torch.device("cpu"))
        )

        # Assuming features for title, author, product, review
        num_features = 4  # how many text features we're combining

        # The feature combiner layer (you could have more complex architecture here if necessary)
        self.feature_combiner = nn.Linear(hidden_size * num_features, hidden_size)

        # The classifier head
        self.classifier = nn.Linear(
            hidden_size + 1, num_labels
        )  # +1 for the star rating feature
        self.dropout = nn.Dropout(star_hidden_dropout_prob)

    def forward(
        self,
        title_input_ids,
        title_attention_mask,
        author_input_ids,
        author_attention_mask,
        product_input_ids,
        product_attention_mask,
        review_input_ids,
        review_attention_mask,
        stars,
    ):
        if not isinstance(self.bert, Module):
            raise ValueError("Model must be an instance of Module")

        # Process each text input through the fine-tuned BERT independently
        # Extract the last hidden state of the [CLS] token from each output
        title_cls = self.bert(
            title_input_ids, attention_mask=title_attention_mask
        ).pooler_output
        author_cls = self.bert(
            author_input_ids, attention_mask=author_attention_mask
        ).pooler_output
        product_cls = self.bert(
            product_input_ids, attention_mask=product_attention_mask
        ).pooler_output
        review_cls = self.bert(
            review_input_ids, attention_mask=review_attention_mask
        ).pooler_output

        # Combine [CLS] token outputs for all text features
        combined_cls = torch.cat(
            (title_cls, author_cls, product_cls, review_cls), dim=1
        )

        # Apply dropout and pass through the combiner layer
        combined_features = self.dropout(self.feature_combiner(combined_cls))

        # Concatenate the stars rating to the combined text features
        combined_input = torch.cat(
            (combined_features, stars.unsqueeze(1)), dim=1
        )  # Ensure stars has the correct shape
        # Pass combined features through the final classifier layer
        logits = self.classifier(combined_input)

        return logits

### Load the dataset


In [None]:
# Load the dataset
amz_combined_file_path = "../datasets/amazon_combined.parquet"
amz_combined_df = pd.read_parquet(amz_combined_file_path)

# Display the first few rows of the dataset for a quick overview
amz_combined_df.head()

### Clean the dataset


In [None]:
# Data cleaning: removing special characters and escape sequences from the sentences
amz_combined_df["sentence"] = amz_combined_df["sentence"].apply(
    lambda x: re.sub(r"[\n\r\t]+", " ", x)
)

# Checking for any null values in the dataset
amz_combined_null_check = amz_combined_df.isnull().sum()

# Checking the distribution of the 'is_sarcastic' column
amz_combined_label_distribution = amz_combined_df["is_sarcastic"].value_counts(
    normalize=True
)

amz_combined_null_check, amz_combined_label_distribution

In [None]:
# Splitting the dataset into training, validation, and testing sets
amz_combined_train_data, amz_combined_test_data = train_test_split(
    amz_combined_df, test_size=0.3, random_state=42
)
amz_combined_val_data, amz_combined_test_data = train_test_split(
    amz_combined_test_data, test_size=0.5, random_state=42
)

# Showing the size of each split
amz_combined_train_size, amz_combined_val_size, amz_combined_test_size = (
    len(amz_combined_train_data),
    len(amz_combined_val_data),
    len(amz_combined_test_data),
)
amz_combined_train_size, amz_combined_val_size, amz_combined_test_size

In [None]:
# Hyperparameters
AMZ_COMBINED_PRETRAINED_MODEL_NAME_OR_PATH = COMBINED_PRETRAINED_MODEL_NAME_OR_PATH
AMZ_COMBINED_NUM_LABELS = COMBINED_NUM_LABELS
AMZ_COMBINED_HIDDEN_DROPOUT_PROB = COMBINED_HIDDEN_DROPOUT_PROB
AMZ_COMBINED_ATTENTION_PROBS_DROPOUT_PROB = COMBINED_ATTENTION_PROBS_DROPOUT_PROB

# Specific hyperparameters for the amz model
AMZ_COMBINED_HIDDEN_SIZE = 768  # Default hidden size for BERT base
AMZ_COMBINED_STAR_HIDDEN_DROPOUT_PROB = 0.3  # Dropout rate for the star rating feature

# Instantiate the extended model
amz_combined_model = ExtendedBertForMultiFeatureClassification(
    pretrained_bert_path=AMZ_COMBINED_PRETRAINED_MODEL_NAME_OR_PATH,
    hidden_size=AMZ_COMBINED_HIDDEN_SIZE,
    num_labels=AMZ_COMBINED_NUM_LABELS,
    hidden_dropout_prob=AMZ_COMBINED_HIDDEN_DROPOUT_PROB,
    attention_probs_dropout_prob=AMZ_COMBINED_ATTENTION_PROBS_DROPOUT_PROB,
    star_hidden_dropout_prob=AMZ_COMBINED_STAR_HIDDEN_DROPOUT_PROB,
)

# Send the model to GPU if available
amz_combined_model.to(device)  # type: ignore

amz_combined_feature_keys = [
    "title_input_ids",
    "title_attention_mask",
    "author_input_ids",
    "author_attention_mask",
    "product_input_ids",
    "product_attention_mask",
    "review_input_ids",
    "review_attention_mask",
    "stars",
]