In [2]:
import time
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import pandas as pd

In [3]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Load the tokenizer and the RoBERTa model with 2 labels for binary classification
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [5]:
def load_data():
    train_data = pd.read_csv("/kaggle/input/hate-comments/balanced_comments_dataset.csv")
    test_data = pd.read_csv("/kaggle/input/hate-comments/balanced_test_dataset.csv")
    train_data['comment'] = train_data['comment'].astype(str)
    test_data['comment'] = test_data['comment'].astype(str)
    return train_data, test_data

In [6]:
def tokenize_function(examples):
    return tokenizer(examples['comment'], padding='max_length', truncation=True)

In [7]:
def prepare_data(train_data, test_data, use_weighted_sampler=True):
    train_dataset = Dataset.from_pandas(train_data[['comment', 'toxic']])
    test_dataset = Dataset.from_pandas(test_data[['comment', 'toxic']])
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'toxic'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'toxic'])
   
    batch_size = 8
    if use_weighted_sampler:
        class_counts = train_data['toxic'].value_counts()
        sample_weights = [1.0 / class_counts[c] for c in train_data['toxic']]
        sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    else:
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
   
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, test_loader

In [8]:
def train_model(config_name, use_mixed_precision=True, use_grad_accum=True, use_early_stopping=True, use_weighted_sampler=True):
    train_data, test_data = load_data()
    train_loader, test_loader = prepare_data(train_data, test_data, use_weighted_sampler)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
   
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 5
    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
   
    loss_fn = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
    accumulation_steps = 4 if use_grad_accum else 1
    early_stopping_patience = 3 if use_early_stopping else None
   
    best_val_loss = float('inf')
    patience_counter = 0
   
    start_time = time.time()
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
           
            if use_mixed_precision:
                with torch.cuda.amp.autocast():
                    outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['toxic'])
                    loss = outputs.loss / accumulation_steps
                scaler.scale(loss).backward()
            else:
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['toxic'])
                loss = outputs.loss / accumulation_steps
                loss.backward()
           
            if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
                if use_mixed_precision:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                lr_scheduler.step()
           
            total_train_loss += loss.item() * accumulation_steps
       
        avg_train_loss = total_train_loss / len(train_loader)
        model.eval()
        total_val_loss = 0
        all_preds, all_labels = [], []
       
        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['toxic'])
                total_val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                labels = batch['toxic'].cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels)
       
        avg_val_loss = total_val_loss / len(test_loader)
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
       
        print(f"{config_name} | Epoch {epoch + 1}: Train Loss {avg_train_loss:.4f}, Val Loss {avg_val_loss:.4f}, Acc {accuracy:.4f}, F1 {f1:.4f}")
       
        if use_early_stopping and avg_val_loss >= best_val_loss:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print(f"{config_name}: Early stopping triggered")
                break
        else:
            best_val_loss = avg_val_loss
            patience_counter = 0
   
    training_time = time.time() - start_time
    return accuracy, f1, training_time

In [9]:
def run_ablation_study():
    configs = [
        ("Full Model", True, True, True, True),
        ("Without Mixed Precision", False, True, True, True),
        ("Without Gradient Accumulation", True, False, True, True),
        ("Without Early Stopping", True, True, False, True),
        ("Basic Fine-tuning", False, False, False, False)
    ]
   
    results = []
    for config_name, mixed_precision, grad_accum, early_stopping, weighted_sampler in configs:
        acc, f1, train_time = train_model(config_name, mixed_precision, grad_accum, early_stopping, weighted_sampler)
        results.append((config_name, acc, f1, train_time))
   
    print("\nAblation Study Results")
    print("Configuration | Accuracy | F1 Score | Training Time")
    for res in results:
        print(f"{res[0]} | {res[1]*100:.2f}% | {res[2]:.2f} | {res[3]:.2f}x")

In [None]:
run_ablation_study()

Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
  with torch.cuda.amp.autocast():


Full Model | Epoch 1: Train Loss 0.2461, Val Loss 0.2756, Acc 0.8857, F1 0.8958


  with torch.cuda.amp.autocast():


Full Model | Epoch 2: Train Loss 0.1643, Val Loss 0.4241, Acc 0.8977, F1 0.9057


  with torch.cuda.amp.autocast():


Full Model | Epoch 3: Train Loss 0.1572, Val Loss 0.2592, Acc 0.9080, F1 0.9134


  with torch.cuda.amp.autocast():


Full Model | Epoch 4: Train Loss 0.1364, Val Loss 0.3037, Acc 0.8972, F1 0.9052


  with torch.cuda.amp.autocast():


Full Model | Epoch 5: Train Loss 0.1200, Val Loss 0.3586, Acc 0.8979, F1 0.9053


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Without Mixed Precision | Epoch 1: Train Loss 0.2512, Val Loss 0.2466, Acc 0.9072, F1 0.9112
Without Mixed Precision | Epoch 2: Train Loss 0.1788, Val Loss 0.3631, Acc 0.8782, F1 0.8905
Without Mixed Precision | Epoch 3: Train Loss 0.1532, Val Loss 0.3387, Acc 0.8839, F1 0.8947
Without Mixed Precision | Epoch 4: Train Loss 0.1376, Val Loss 0.4129, Acc 0.8818, F1 0.8932
Without Mixed Precision: Early stopping triggered


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 1: Train Loss 0.1834, Val Loss 0.2945, Acc 0.9031, F1 0.9095


  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 2: Train Loss 0.1062, Val Loss 0.4376, Acc 0.8868, F1 0.8975


  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 3: Train Loss 0.0741, Val Loss 0.4743, Acc 0.9040, F1 0.9107


  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 4: Train Loss 0.0457, Val Loss 0.4858, Acc 0.9021, F1 0.9091
Without Gradient Accumulation: Early stopping triggered


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 1: Train Loss 0.2611, Val Loss 0.2846, Acc 0.8966, F1 0.9038


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 2: Train Loss 0.1842, Val Loss 0.3129, Acc 0.8961, F1 0.9040


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 3: Train Loss 0.1555, Val Loss 0.2697, Acc 0.8975, F1 0.9028


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 4: Train Loss 0.1351, Val Loss 0.3100, Acc 0.9008, F1 0.9073


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 5: Train Loss 0.1230, Val Loss 0.3787, Acc 0.8875, F1 0.8976


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Basic Fine-tuning | Epoch 1: Train Loss 0.1953, Val Loss 0.2431, Acc 0.9080, F1 0.9140
Basic Fine-tuning | Epoch 2: Train Loss 0.1163, Val Loss 0.3129, Acc 0.9053, F1 0.9119
Basic Fine-tuning | Epoch 3: Train Loss 0.0710, Val Loss 0.3949, Acc 0.8918, F1 0.9010
Basic Fine-tuning | Epoch 4: Train Loss 0.0373, Val Loss 0.4874, Acc 0.8980, F1 0.9055
