<a href="https://colab.research.google.com/github/itay-matityahu/comp_learning_2026/blob/main/bert%2Bcustom_head_sarcasem_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Bert with Custom Head for Sarcasm Detection**

Daniel Nissani & Itay Matityahu


#Setup

## Libraries

In [None]:
!pip install -q transformers datasets evaluate accelerate

import pandas as pd
import torch.nn as nn
from datasets import Dataset
from google.colab import drive
import numpy as np
import evaluate
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, AutoModel, get_scheduler
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
import evaluate
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix
import itertools
from google.colab import drive
drive.mount('/content/drive')



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


##Data Loading

In [None]:
def _read_headline_split(path):
    """Parse one split file into a list of ``[label, headline]`` rows.

    Every line is stripped and split on whitespace at most three times. The
    first two fields are discarded; the third field is kept as the label and
    the remainder of the line as the headline. Blank lines are skipped so
    they cannot produce empty rows that would break the later int cast.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip().split(None, 3)[2:] for line in f if line.strip()]


train_path = '/content/drive/My Drive/computational_learning/final_project/data/headline_train.txt'
train_data = _read_headline_split(train_path)
train_df = pd.DataFrame(train_data, columns=['label', 'headline'])

test_path = '/content/drive/My Drive/computational_learning/final_project/data/headline_test.txt'
test_data = _read_headline_split(test_path)
test_df = pd.DataFrame(test_data, columns=['label', 'headline'])

val_path = '/content/drive/My Drive/computational_learning/final_project/data/headline_val.txt'
val_data = _read_headline_split(val_path)
val_df = pd.DataFrame(val_data, columns=['label', 'headline'])

# Labels are read as strings; store them as integers in every split.
train_df['label'] = train_df['label'].astype(int)
val_df['label'] = val_df['label'].astype(int)
test_df['label'] = test_df['label'].astype(int)

print(f"Train size: {len(train_df)}")
print(f"Val size: {len(val_df)}")
print(f"Test size: {len(test_df)}")
print(train_df.head())

Train size: 21367
Val size: 2670
Test size: 2672
   label                                           headline
0      1  fed-up employee just about 14 years away from ...
1      1           baseball slugger on pace to hit 60 women
2      0  watch : hungry moose shops for groceries in sa...
3      0    9 things i learned during a year of first dates
4      0  ryan reynolds knows blake lively slays the red...


## Data Analysis

In [None]:
# Pool the labels of all three splits to inspect the overall class balance.
all_labels = pd.concat([train_df['label'], val_df['label'], test_df['label']])
counts = all_labels.value_counts()
percentages = all_labels.value_counts(normalize=True) * 100

print("Distribution of Sarcasm in Dataset:")
print("-" * 30)
for label, count in counts.items():
    # The label column holds integers, so comparing against the string '1'
    # never matched and every class was reported as "Non-Sarcastic".
    status = "Sarcastic" if int(label) == 1 else "Non-Sarcastic"
    print(f"{status} ({label}): {count} samples ({percentages[label]:.2f}%)")

Distribution of Sarcasm in Dataset:
------------------------------
Non-Sarcastic (0): 14985 samples (56.10%)
Non-Sarcastic (1): 11724 samples (43.90%)


As we can see, the classes are roughly balanced (about 56% non-sarcastic vs. 44% sarcastic), so, in keeping with the method used in the paper, we will use accuracy to evaluate the model.

# Model Setup

##Tokenization Setup

In [None]:
MODEL_NAME = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_fn(batch):
    """Tokenize a batch of headlines and carry the integer labels along.

    Args:
        batch: Mapping with a "headline" sequence of strings and a "label"
            sequence of values convertible to ``int``.

    Returns:
        The tokenizer output, with every example truncated/padded to exactly
        64 tokens, plus a "label" entry holding the labels as ints.
    """
    tokenized = tokenizer(
        batch["headline"],
        truncation=True,
        max_length=64,
        padding="max_length",
    )

    # Explicitly add the labels to the result.
    tokenized["label"] = [int(label) for label in batch["label"]]

    return tokenized



##Architecture Setup

In [None]:
class DebertaWithCustomHead(nn.Module):
    """Pretrained encoder followed by a configurable MLP classification head.

    The head is a stack of ``Dropout -> Linear -> activation -> LayerNorm``
    stages (one per entry of ``head_hidden_sizes``) followed by a final
    ``Dropout -> Linear`` that maps to ``num_labels`` logits.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes.
        head_hidden_sizes: Width of each hidden stage of the head.
        dropout: Dropout probability used before every linear layer.
        activation: ``"gelu"`` (case-insensitive) selects ``nn.GELU``; any
            other value selects ``nn.ReLU``.
        pooling: ``"mean"`` selects masked mean pooling; any other value
            selects the first-token ([CLS]) representation.
        freeze_encoder: When true, encoder parameters get
            ``requires_grad = False`` so only the head is trained.
    """

    def __init__(
        self,
        model_name: str = "microsoft/deberta-v3-base",
        num_labels: int = 2,
        head_hidden_sizes=(256,),
        dropout: float = 0.2,
        activation: str = "gelu",
        pooling: str = "cls",       # "cls" or "mean"
        freeze_encoder: bool = False,
    ):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.num_labels = num_labels
        self.pooling = pooling

        hidden_size = self.encoder.config.hidden_size

        act = nn.GELU() if activation.lower() == "gelu" else nn.ReLU()

        # Build the head; the layer order is kept fixed so that previously
        # saved state dicts keep loading into ``self.classifier``.
        layers = []
        in_dim = hidden_size
        for h in head_hidden_sizes:
            layers += [
                nn.Dropout(dropout),
                nn.Linear(in_dim, h),
                act,
                nn.LayerNorm(h),
            ]
            in_dim = h

        layers += [
            nn.Dropout(dropout),
            nn.Linear(in_dim, num_labels),
        ]
        self.classifier = nn.Sequential(*layers)

        self.loss_fn = nn.CrossEntropyLoss()

        if freeze_encoder:
            for p in self.encoder.parameters():
                p.requires_grad = False

    def _mean_pool(self, last_hidden_state, attention_mask):
        """Average token representations over the non-padded positions.

        Args:
            last_hidden_state: Encoder output of shape (B, T, H).
            attention_mask: (B, T) with 1 for tokens and 0 for padding, or
                ``None`` to average over every position.

        Returns:
            Tensor of shape (B, H).
        """
        if attention_mask is None:
            # No mask supplied: treat every position as a real token instead
            # of failing on ``None.unsqueeze``.
            attention_mask = last_hidden_state.new_ones(last_hidden_state.shape[:2])

        mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # (B,T,1)
        summed = (last_hidden_state * mask).sum(dim=1)                  # (B,H)
        # Clamp so an all-padding row divides by a tiny number, not zero.
        counts = mask.sum(dim=1).clamp(min=1e-6)                        # (B,1)
        return summed / counts

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Encode the inputs, pool them, and classify.

        Extra keyword arguments are forwarded to the encoder unchanged.

        Returns:
            Dict with ``"logits"`` and ``"loss"``; the loss is the
            cross-entropy against ``labels``, or ``None`` when no labels
            are given.
        """
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        last_hidden = out.last_hidden_state  # (B, T, H)

        if self.pooling == "mean":
            pooled = self._mean_pool(last_hidden, attention_mask)
        else:
            pooled = last_hidden[:, 0]  # [CLS] token representation

        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}


#Training

### Helper Functions

In [None]:
@torch.no_grad()
def eval_accuracy(model, val_loader, device):
    """Return the fraction of examples in ``val_loader`` the model gets right.

    The model is switched to eval mode. Each batch must provide "input_ids",
    "attention_mask" and "label"; the prediction is the argmax over the
    model's "logits" output.
    """
    model.eval()

    n_hits, n_seen = 0, 0
    for raw_batch in val_loader:
        # Put every tensor of the batch on the target device.
        on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        gold = on_device['label']

        logits = model(
            input_ids=on_device["input_ids"],
            attention_mask=on_device["attention_mask"],
        )["logits"]
        predicted = logits.argmax(dim=-1)

        n_hits += (predicted == gold).sum().item()
        n_seen += gold.size(0)

    return n_hits / n_seen


# ---------------------------------------
# Train one configuration (one HP set)
# ---------------------------------------
def train_one_config(
    train_loader,
    model_name,
    num_labels,
    device,
    # hyperparameters:
    lr=2e-5,
    scheduler_type="linear",
    pooling="cls",            # "cls" or "mean"
    head_hidden_sizes=(512, 256),
    dropout=0.2,
    weight_decay=0.01,
    num_epochs=2,
    train_batch_size=16,
    warmup_ratio=0.06,
    grad_accum_steps=1,
    max_grad_norm=1.0,
    use_amp=True,
):
    """Train a fresh ``DebertaWithCustomHead`` with one hyperparameter set.

    Args:
        train_loader: Iterable of batches providing "input_ids",
            "attention_mask" and "label" tensors; must support ``len()``.
        model_name: Encoder identifier passed to the model constructor.
        num_labels: Number of output classes.
        device: ``torch.device`` to train on.
        lr: AdamW learning rate.
        scheduler_type: Scheduler name passed to ``get_scheduler``.
        pooling: Pooling mode passed to the model ("cls" or "mean").
        head_hidden_sizes: Hidden widths of the classification head.
        dropout: Dropout probability of the head.
        weight_decay: AdamW weight decay.
        num_epochs: Number of passes over ``train_loader``.
        train_batch_size: Not used by this function (the batch size is
            whatever ``train_loader`` yields); kept for interface stability.
        warmup_ratio: Fraction of the optimizer steps used for warmup.
        grad_accum_steps: Number of batches accumulated per optimizer step.
        max_grad_norm: Gradient-norm clipping threshold.
        use_amp: Enable mixed precision; only takes effect on CUDA devices.

    Returns:
        The trained model, left in training mode.
    """
    # build model fresh for each run
    model = DebertaWithCustomHead(
        model_name=model_name,
        num_labels=num_labels,
        head_hidden_sizes=head_hidden_sizes,
        dropout=dropout,
        pooling=pooling,
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    num_batches = len(train_loader)
    steps_per_epoch = int(np.ceil(num_batches / grad_accum_steps))
    total_steps = steps_per_epoch * num_epochs
    warmup_steps = int(total_steps * warmup_ratio)

    lr_scheduler = get_scheduler(
        name=scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    # torch.cuda.amp.GradScaler / autocast are deprecated in favour of the
    # device-agnostic torch.amp entry points used below.
    amp_enabled = use_amp and device.type == "cuda"
    scaler = torch.amp.GradScaler("cuda", enabled=amp_enabled)

    model.train()

    for epoch in range(num_epochs):
        pbar = tqdm(train_loader, desc=f"train epoch {epoch+1}/{num_epochs}", leave=False)
        optimizer.zero_grad(set_to_none=True)

        for step, batch in enumerate(pbar):
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch['label']

            with torch.amp.autocast("cuda", enabled=amp_enabled):
                out = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=labels,
                )
                # Scale so the accumulated gradient averages over the group.
                loss = out["loss"] / grad_accum_steps

            scaler.scale(loss).backward()

            # Also step on the final batch of the epoch: otherwise a trailing
            # partial accumulation group would be discarded by the next
            # zero_grad, and the scheduler would never reach total_steps
            # (which is computed with ceil).
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % grad_accum_steps == 0 or is_last_batch:
                # Unscale first so clipping sees the true gradient norms.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
                lr_scheduler.step()

                pbar.set_postfix({"loss": float(loss.item() * grad_accum_steps),
                                  "lr": lr_scheduler.get_last_lr()[0]})

    return model


# -------------------------
# Grid search
# -------------------------
def grid_search(
    train_loader,
    val_loader,
    model_name="microsoft/deberta-v3-base",
    num_labels=2,
    # grid:
    lrs=(1e-5, 2e-5),
    schedulers=("linear",),
    poolings=("cls", "mean"),
    heads=((512, 256), (256, 128, 64)),
    dropouts=(0.2,),
    # training budget:
    num_epochs=2,
    train_batch_size=16,
    grad_accum_steps=1,
    val_batch_size=32,
):
    """Train every hyperparameter combination and keep the best on validation.

    The grid is the Cartesian product of ``lrs``, ``schedulers``,
    ``poolings``, ``heads`` and ``dropouts``. Each configuration is trained
    with ``train_one_config`` and scored with ``eval_accuracy``;
    configurations that run out of memory are skipped.

    ``train_batch_size`` and ``val_batch_size`` are not used by this function
    (batch sizes are fixed by the loaders passed in); they are kept for
    interface stability.

    Returns:
        ``(best_model, best_cfg, best_acc)`` where ``best_model`` is a newly
        built model loaded with the best weights (not moved to any device).

    Raises:
        RuntimeError: If no configuration finished training, or on any
            non-OOM runtime error raised during training/evaluation.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    best_acc = -1.0
    best_cfg = None
    best_state = None

    configs = list(itertools.product(lrs, schedulers, poolings, heads, dropouts))
    print("Total configs:", len(configs))

    for lr, sched, pooling, head, dropout in configs:
        # Record the scheduler actually used rather than a hard-coded name.
        cfg = {"lr": lr, "scheduler": sched, "pooling": pooling, "head": head, "dropout": dropout}
        print("\nConfig:", cfg)

        try:
            model = train_one_config(
                train_loader=train_loader,
                model_name=model_name,
                num_labels=num_labels,
                device=device,
                lr=lr,
                scheduler_type=sched,
                pooling=pooling,
                head_hidden_sizes=head,
                dropout=dropout,
                num_epochs=num_epochs,
                grad_accum_steps=grad_accum_steps,
                use_amp=True,
            )

            acc = eval_accuracy(model, val_loader, device)
            print("VAL accuracy:", acc)

        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print("OOM. Reduce train_batch_size or increase grad_accum_steps.")
                if device.type == "cuda":
                    torch.cuda.empty_cache()
                continue
            raise

        if acc > best_acc:
            best_acc = acc
            best_cfg = cfg
            # Snapshot on CPU so the weights survive releasing the model.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

        # Drop this run's model before the next one is built so two models
        # do not occupy device memory at the same time.
        del model
        if device.type == "cuda":
            torch.cuda.empty_cache()

    if best_cfg is None:
        raise RuntimeError(
            "grid_search: no configuration finished training "
            "(the grid is empty or every run went out of memory)."
        )

    print("\nBEST:", best_cfg)
    print("BEST VAL accuracy:", best_acc)

    # rebuild best model object + load best weights
    best_model = DebertaWithCustomHead(
        model_name=model_name,
        num_labels=num_labels,
        head_hidden_sizes=best_cfg["head"],
        dropout=best_cfg["dropout"],
        pooling=best_cfg["pooling"],
    )
    best_model.load_state_dict(best_state)

    return best_model, best_cfg, best_acc

def custom_collate_fn(batch):
    """Collate tokenized examples into one padded batch with a "label" tensor.

    ``batch`` is a list of examples, each providing "input_ids",
    "attention_mask" and "label". The token fields are padded together by
    the module-level ``tokenizer`` and returned as PyTorch tensors; the
    labels are attached under the key "label".
    """
    token_fields = {
        'input_ids': [example['input_ids'] for example in batch],
        'attention_mask': [example['attention_mask'] for example in batch],
    }
    label_tensor = torch.tensor([example['label'] for example in batch])

    collated = tokenizer.pad(token_fields, padding=True, return_tensors='pt')
    collated['label'] = label_tensor
    return collated

### Run Train

In [None]:
# Convert the pandas frames into Hugging Face datasets.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Tokenize every split; the raw text column is dropped afterwards.
train_df_token, val_df_token, test_df_token = (
    split.map(tokenize_fn, batched=True, remove_columns=["headline"])
    for split in (train_ds, val_ds, test_ds)
)

# collator (pads batches)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Dataloaders; only the training split is shuffled.
train_loader = DataLoader(
    train_df_token,
    batch_size=32,
    shuffle=True,
    collate_fn=custom_collate_fn,
)
val_loader = DataLoader(
    val_df_token,
    batch_size=32,
    shuffle=False,
    collate_fn=custom_collate_fn,
)
test_loader = DataLoader(
    test_df_token,
    batch_size=32,
    shuffle=False,
    collate_fn=custom_collate_fn,
)

# Hyper-parameter search (left disabled here):
#best_model, best_cfg, best_acc = grid_search(train_loader,val_loader)

Map:   0%|          | 0/21367 [00:00<?, ? examples/s]

Map:   0%|          | 0/2670 [00:00<?, ? examples/s]

Map:   0%|          | 0/2672 [00:00<?, ? examples/s]

Total configs: 8

Config: {'lr': 1e-05, 'scheduler': 'linear', 'pooling': 'cls', 'head': (512, 256), 'dropout': 0.2}


  scaler = torch.cuda.amp.GradScaler(enabled=(use_amp and device.type == "cuda"))


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.amp.autocast(enabled=(use_amp and device.type == "cuda")):


train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9307116104868914

Config: {'lr': 1e-05, 'scheduler': 'linear', 'pooling': 'cls', 'head': (256, 128, 64), 'dropout': 0.2}


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9239700374531835

Config: {'lr': 1e-05, 'scheduler': 'linear', 'pooling': 'mean', 'head': (512, 256), 'dropout': 0.2}


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9325842696629213

Config: {'lr': 1e-05, 'scheduler': 'linear', 'pooling': 'mean', 'head': (256, 128, 64), 'dropout': 0.2}


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9348314606741573

Config: {'lr': 2e-05, 'scheduler': 'linear', 'pooling': 'cls', 'head': (512, 256), 'dropout': 0.2}


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9352059925093633

Config: {'lr': 2e-05, 'scheduler': 'linear', 'pooling': 'cls', 'head': (256, 128, 64), 'dropout': 0.2}


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9393258426966292

Config: {'lr': 2e-05, 'scheduler': 'linear', 'pooling': 'mean', 'head': (512, 256), 'dropout': 0.2}


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9438202247191011

Config: {'lr': 2e-05, 'scheduler': 'linear', 'pooling': 'mean', 'head': (256, 128, 64), 'dropout': 0.2}


train epoch 1/2:   0%|          | 0/668 [00:00<?, ?it/s]

train epoch 2/2:   0%|          | 0/668 [00:00<?, ?it/s]

VAL accuracy: 0.9397003745318352

BEST: {'lr': 2e-05, 'scheduler': 'linear', 'pooling': 'mean', 'head': (512, 256), 'dropout': 0.2}
BEST VAL accuracy: 0.9438202247191011


### Load the best model

In [None]:
# Saving step (left disabled here):
#torch.save(best_model.state_dict(), '/content/drive/My Drive/computational_learning/final_project/best_model_weights.pt')

# Configuration used to rebuild the architecture for the saved weights.
best_cfg = dict(
    lr=2e-05,
    scheduler='linear',
    pooling='mean',
    head=(512, 256),
    dropout=0.2,
)

model_name = "microsoft/deberta-v3-base"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

path = '/content/drive/My Drive/computational_learning/final_project/best_model_weights.pt'

# Build the architecture first, then move it to the target device.
loaded_model = DebertaWithCustomHead(
    model_name=model_name,
    num_labels=2,
    head_hidden_sizes=best_cfg['head'],
    dropout=best_cfg['dropout'],
    pooling=best_cfg['pooling'],
)
loaded_model = loaded_model.to(device)

# Restore the saved weights directly onto the same device.
state_dict = torch.load(path, map_location=device)
loaded_model.load_state_dict(state_dict)

loaded_model.eval()

print("Model loaded successfully with BEST configuration!")

Model loaded successfully with BEST configuration!


### Performance Checking (test set)

In [None]:
@torch.no_grad()
def evaluate_model(model, data_loader, device):
    """
    Evaluates the model using a pre-configured DataLoader and prints a report.

    Assumes the loader provides 'input_ids', 'attention_mask', and 'label',
    and that labels are 0 (non-sarcastic) or 1 (sarcastic). Prints a
    row-normalized confusion matrix (percentages per actual class) and the
    overall accuracy. Returns nothing.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()
    all_preds = []
    all_labels = []

    print(f"Generating predictions for {len(data_loader.dataset)} samples...")

    for batch in tqdm(data_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # prediction
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

        logits = outputs["logits"]
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["label"].cpu().numpy())

    if not all_labels:
        raise ValueError("evaluate_model: data_loader produced no samples")

    y_pred = np.array(all_preds)
    y_true = np.array(all_labels)

    # Confusion Matrix
    # Pin both classes so the matrix is always 2x2, even when one class never
    # occurs in the labels or predictions (the report indexes [0..1, 0..1]).
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Normalize each row by its number of true samples; a class with no true
    # samples gives a 0% row instead of NaN from a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = (cm.astype('float') / np.maximum(row_totals, 1)) * 100

    print("\n" + "="*45)
    print(f"{'FINAL TEST REPORT':^45}")
    print("="*45)
    print(f"{'':<18} | {'Pred: NON':<10} | {'Pred: SAR':<10}")
    print("-" * 45)
    print(f"{'Actual: NON':<18} | {cm_norm[0,0]:>8.2f}% | {cm_norm[0,1]:>8.2f}%")
    print("-" * 45)
    print(f"{'Actual: SAR':<18} | {cm_norm[1,0]:>8.2f}% | {cm_norm[1,1]:>8.2f}%")
    print("="*45)

    overall_acc = (np.diag(cm).sum() / cm.sum()) * 100
    print(f"Overall Test Accuracy: {overall_acc:.2f}%")

# Print the confusion matrix and accuracy of the restored model on the test loader.
evaluate_model(loaded_model, test_loader, device)

Generating predictions for 2672 samples...


  0%|          | 0/84 [00:00<?, ?it/s]


              FINAL TEST REPORT              
                   | Pred: NON  | Pred: SAR 
---------------------------------------------
Actual: NON        |    96.91% |     3.09%
---------------------------------------------
Actual: SAR        |     8.46% |    91.54%
Overall Test Accuracy: 94.46%


##