In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/testbalance/balanced_test_dataset.csv
/kaggle/input/balanced/balanced_comments_dataset.csv
/kaggle/input/finetune/test_data.csv
/kaggle/input/finetune/traindata.csv


In [2]:
%cd /kaggle/working

from IPython.display import FileLink

# Create a downloadable link for the saved model
FileLink(r'best_bert_model.pth')


/kaggle/working


In [3]:
import time
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import pandas as pd

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the RoBERTa model with 2 labels for binary classification
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

def load_data():
    train_data = pd.read_csv('/kaggle/input/balanced/balanced_comments_dataset.csv')
    test_data = pd.read_csv('/kaggle/input/testbalance/balanced_test_dataset.csv')
    train_data['comment'] = train_data['comment'].astype(str)
    test_data['comment'] = test_data['comment'].astype(str)
    return train_data, test_data

def tokenize_function(examples):
    return tokenizer(examples['comment'], padding='max_length', truncation=True)

def prepare_data(train_data, test_data, use_weighted_sampler=True):
    train_dataset = Dataset.from_pandas(train_data[['comment', 'toxic']])
    test_dataset = Dataset.from_pandas(test_data[['comment', 'toxic']])
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'toxic'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'toxic'])
    
    batch_size = 8
    if use_weighted_sampler:
        class_counts = train_data['toxic'].value_counts()
        sample_weights = [1.0 / class_counts[c] for c in train_data['toxic']]
        sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    else:
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, test_loader

def train_model(config_name, use_mixed_precision=True, use_grad_accum=True, use_early_stopping=True, use_weighted_sampler=True):
    train_data, test_data = load_data()
    train_loader, test_loader = prepare_data(train_data, test_data, use_weighted_sampler)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 5
    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    
    loss_fn = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
    accumulation_steps = 4 if use_grad_accum else 1
    early_stopping_patience = 3 if use_early_stopping else None
    
    best_val_loss = float('inf')
    patience_counter = 0
    
    start_time = time.time()
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            
            if use_mixed_precision:
                with torch.cuda.amp.autocast():
                    outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['toxic'])
                    loss = outputs.loss / accumulation_steps
                scaler.scale(loss).backward()
            else:
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['toxic'])
                loss = outputs.loss / accumulation_steps
                loss.backward()
            
            if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
                if use_mixed_precision:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                lr_scheduler.step()
            
            total_train_loss += loss.item() * accumulation_steps
        
        avg_train_loss = total_train_loss / len(train_loader)
        model.eval()
        total_val_loss = 0
        all_preds, all_labels = [], []
        
        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['toxic'])
                total_val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                labels = batch['toxic'].cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels)
        
        avg_val_loss = total_val_loss / len(test_loader)
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
        
        print(f"{config_name} | Epoch {epoch + 1}: Train Loss {avg_train_loss:.4f}, Val Loss {avg_val_loss:.4f}, Acc {accuracy:.4f}, F1 {f1:.4f}")
        
        if use_early_stopping and avg_val_loss >= best_val_loss:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print(f"{config_name}: Early stopping triggered")
                break
        else:
            best_val_loss = avg_val_loss
            patience_counter = 0
    
    training_time = time.time() - start_time
    return accuracy, f1, training_time

def run_ablation_study():
    configs = [
        ("Full Model", True, True, True, True),
        ("Without Mixed Precision", False, True, True, True),
        ("Without Gradient Accumulation", True, False, True, True),
        ("Without Early Stopping", True, True, False, True),
        ("Basic Fine-tuning", False, False, False, False)
    ]
    
    results = []
    for config_name, mixed_precision, grad_accum, early_stopping, weighted_sampler in configs:
        acc, f1, train_time = train_model(config_name, mixed_precision, grad_accum, early_stopping, weighted_sampler)
        results.append((config_name, acc, f1, train_time))
    
    print("\nAblation Study Results")
    print("Configuration | Accuracy | F1 Score | Training Time")
    for res in results:
        print(f"{res[0]} | {res[1]*100:.2f}% | {res[2]:.2f} | {res[3]:.2f}x")

run_ablation_study()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
  with torch.cuda.amp.autocast():


Full Model | Epoch 1: Train Loss 0.2414, Val Loss 0.2885, Acc 0.8890, F1 0.8982


  with torch.cuda.amp.autocast():


Full Model | Epoch 2: Train Loss 0.1544, Val Loss 0.2344, Acc 0.9039, F1 0.9098


  with torch.cuda.amp.autocast():


Full Model | Epoch 3: Train Loss 0.1319, Val Loss 0.3652, Acc 0.9017, F1 0.9087


  with torch.cuda.amp.autocast():


Full Model | Epoch 4: Train Loss 0.1148, Val Loss 0.3855, Acc 0.8865, F1 0.8966


  with torch.cuda.amp.autocast():


Full Model | Epoch 5: Train Loss 0.1018, Val Loss 0.2583, Acc 0.9056, F1 0.9098
Full Model: Early stopping triggered


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Without Mixed Precision | Epoch 1: Train Loss 0.2457, Val Loss 0.2917, Acc 0.8938, F1 0.9012
Without Mixed Precision | Epoch 2: Train Loss 0.1572, Val Loss 0.3448, Acc 0.8891, F1 0.8986
Without Mixed Precision | Epoch 3: Train Loss 0.1375, Val Loss 0.2852, Acc 0.8904, F1 0.8998
Without Mixed Precision | Epoch 4: Train Loss 0.1128, Val Loss 0.3900, Acc 0.8824, F1 0.8939
Without Mixed Precision | Epoch 5: Train Loss 0.1074, Val Loss 0.2823, Acc 0.8943, F1 0.9029


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 1: Train Loss 0.1693, Val Loss 0.2999, Acc 0.8966, F1 0.9046


  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 2: Train Loss 0.0884, Val Loss 0.3573, Acc 0.9048, F1 0.9110


  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 3: Train Loss 0.0494, Val Loss 0.4343, Acc 0.9034, F1 0.9098


  with torch.cuda.amp.autocast():


Without Gradient Accumulation | Epoch 4: Train Loss 0.0315, Val Loss 0.4457, Acc 0.9006, F1 0.9078
Without Gradient Accumulation: Early stopping triggered


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler() if use_mixed_precision else None
  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 1: Train Loss 0.2578, Val Loss 0.3031, Acc 0.8885, F1 0.8971


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 2: Train Loss 0.1618, Val Loss 0.2492, Acc 0.9001, F1 0.9063


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 3: Train Loss 0.1356, Val Loss 0.3400, Acc 0.8881, F1 0.8980


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 4: Train Loss 0.1272, Val Loss 0.3191, Acc 0.9018, F1 0.9086


  with torch.cuda.amp.autocast():


Without Early Stopping | Epoch 5: Train Loss 0.1062, Val Loss 0.2753, Acc 0.9037, F1 0.9061


Map:   0%|          | 0/19224 [00:00<?, ? examples/s]

Map:   0%|          | 0/12180 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Basic Fine-tuning | Epoch 1: Train Loss 0.1938, Val Loss 0.2699, Acc 0.9004, F1 0.9078
Basic Fine-tuning | Epoch 2: Train Loss 0.0986, Val Loss 0.3092, Acc 0.9085, F1 0.9142
Basic Fine-tuning | Epoch 3: Train Loss 0.0406, Val Loss 0.4220, Acc 0.9039, F1 0.9102
Basic Fine-tuning | Epoch 4: Train Loss 0.0157, Val Loss 0.5250, Acc 0.9060, F1 0.9116
Basic Fine-tuning | Epoch 5: Train Loss 0.0058, Val Loss 0.5835, Acc 0.9037, F1 0.9104

Ablation Study Results
Configuration | Accuracy | F1 Score | Training Time
Full Model | 90.56% | 0.91 | 4586.89x
Without Mixed Precision | 89.43% | 0.90 | 11578.11x
Without Gradient Accumulation | 90.06% | 0.91 | 3973.83x
Without Early Stopping | 90.37% | 0.91 | 4600.48x
Basic Fine-tuning | 90.37% | 0.91 | 11909.47x
