In [None]:
import json
import os
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

warnings.filterwarnings('ignore')


In [None]:
# device: prefer CUDA, then the MPS backend, and fall back to the CPU otherwise
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device.type.upper()}")

In [None]:
# config
# Every tunable of the notebook lives in this cell; later cells only read these names.

# labels: one binary target column per entry
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# data
TRAIN_PATH = '../data/train.csv'
TEST_PATH = '../data/test_1.csv'
RANDOM_SEED = 42

# model
MODEL_NAME = 'distilroberta-base'
MAX_LENGTH = 128              # every comment is truncated/padded to this many tokens
HIDDEN_SIZE = 768             # hidden-state width of the MODEL_NAME encoder
CLASSIFIER_HIDDEN = 256       # width of the classification head's hidden layer
DROPOUT = 0.1
NUM_LABELS = len(LABEL_COLS)  # derived so it cannot drift from LABEL_COLS

# training
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 5e-5
WARMUP_RATIO = 0.1            # fraction of all optimizer steps spent in LR warmup
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0           # gradient-norm clipping bound

# inference
THRESHOLD = 0.5               # probability at or above which a label counts as predicted

# paths
MODEL_SAVE_PATH = '../models/bert.pth'
RESULTS_PATH = '../results/bert_results.json'

# reproducibility: apply RANDOM_SEED so that shuffling, dropout and the random
# initialisation of the classifier head are repeatable on a fresh kernel
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)


In [None]:
# load data: read the train and test CSVs into DataFrames
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# quick sanity check: row counts and the number of positives per label
print(f"Train: {len(train_df):,} | Test: {len(test_df):,}")
print("Label distribution (train):")
print(train_df[LABEL_COLS].sum())


In [None]:
# tokenizer: load the one that belongs to the MODEL_NAME checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {type(tokenizer).__name__}")


In [None]:
# dataset class
class BertDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Reads the ``comment_text`` column and the ``LABEL_COLS`` columns of ``df``.
    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __init__(self, df, tokenizer, max_length=MAX_LENGTH):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = df['comment_text'].values
        # targets are stored as float32 up front
        self.labels = df[LABEL_COLS].values.astype(np.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # str() coerces any non-string cell before it reaches the tokenizer
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # the tokenizer output carries a leading axis of size 1; drop it per field
        item = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

# wrap both frames with the shared tokenizer
train_dataset, test_dataset = (BertDataset(frame, tokenizer) for frame in (train_df, test_df))

print(f"Datasets created: Train={len(train_dataset)}, Test={len(test_dataset)}")


In [None]:
# model class
class ToxicClassifier(nn.Module):
    """Pretrained transformer encoder followed by a two-layer multi-label head.

    The head reads the hidden state at the first position of every sequence and
    emits one raw logit per label; no sigmoid is applied inside the module.

    Args:
        model_name: Checkpoint handed to ``AutoModel.from_pretrained``.
            Defaults to the notebook-level ``MODEL_NAME``.
        classifier_hidden: Width of the head's hidden layer.
            Defaults to ``CLASSIFIER_HIDDEN``.
        dropout: Dropout probability inside the head. Defaults to ``DROPOUT``.
        num_labels: Number of output logits. Defaults to ``NUM_LABELS``.

    All defaults are resolved when the model is instantiated (not when the
    class is defined), so editing the config cell takes effect without
    re-running this cell.
    """

    def __init__(self, model_name=None, classifier_hidden=None, dropout=None, num_labels=None):
        super().__init__()
        model_name = MODEL_NAME if model_name is None else model_name
        classifier_hidden = CLASSIFIER_HIDDEN if classifier_hidden is None else classifier_hidden
        dropout = DROPOUT if dropout is None else dropout
        num_labels = NUM_LABELS if num_labels is None else num_labels

        self.roberta = AutoModel.from_pretrained(model_name)
        # Size the head from the loaded checkpoint instead of a hard-coded width,
        # so switching to a checkpoint with a different hidden size still works.
        hidden_size = self.roberta.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, classifier_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(classifier_hidden, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        """Return the raw logits produced by the head for a batch of token ids."""
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # use the hidden state at sequence position 0 as the sequence summary
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

# build the model on the selected device and report its size
model = ToxicClassifier().to(device)
print(f"Model parameters: {sum(map(torch.numel, model.parameters())):,}")
print(f"Trainable parameters: {sum(torch.numel(w) for w in model.parameters() if w.requires_grad):,}")


In [None]:
# training setup
# only the training loader shuffles; the test loader keeps the dataset order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# the scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs
total_steps = EPOCHS * len(train_loader)
warmup_steps = int(WARMUP_RATIO * total_steps)

optimizer = AdamW(model.parameters(), weight_decay=WEIGHT_DECAY, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)
# multi-label objective computed directly on the logits
criterion = nn.BCEWithLogitsLoss()

print(f"Total steps: {total_steps} | Warmup steps: {warmup_steps}")

In [None]:
# evaluation function
def evaluate(model, dataloader, device, threshold=None, loss_fn=None):
    """Run ``model`` over ``dataloader`` and compute multi-label metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its output
            is treated as logits.
        dataloader: Iterable of dict batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to before the forward pass.
        threshold: Probability at or above which a label counts as predicted.
            Defaults to the notebook-level ``THRESHOLD``.
        loss_fn: Callable invoked as ``loss_fn(logits, labels)``. Defaults to the
            notebook-level ``criterion``. It is assumed to return a per-batch
            mean, which is re-weighted by batch size below.

    Returns:
        dict with the keys ``loss``, ``macro_precision``, ``macro_recall``,
        ``macro_f1``, ``macro_auc_pr`` and ``per_label_auc_pr`` (a list with one
        average-precision score per label column).

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    # resolve the notebook-level defaults at call time
    if threshold is None:
        threshold = THRESHOLD
    if loss_fn is None:
        loss_fn = criterion

    model.eval()
    prob_chunks = []
    label_chunks = []
    loss_sum = 0.0
    n_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)

            # weight each batch mean by its size so that a short final batch
            # does not count as much as a full one
            batch_size = labels.size(0)
            loss_sum += loss_fn(logits, labels).item() * batch_size
            n_samples += batch_size

            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            label_chunks.append(labels.cpu().numpy())

    if n_samples == 0:
        raise ValueError("evaluate() received a dataloader with no samples")

    all_preds = np.vstack(prob_chunks)
    all_labels = np.vstack(label_chunks)

    preds_binary = (all_preds >= threshold).astype(int)

    # macro metrics
    macro_precision = precision_score(all_labels, preds_binary, average='macro', zero_division=0)
    macro_recall = recall_score(all_labels, preds_binary, average='macro', zero_division=0)
    macro_f1 = f1_score(all_labels, preds_binary, average='macro', zero_division=0)

    # AUC-PR (average precision) per label
    per_label_auc_pr = []
    for col in range(all_labels.shape[1]):
        try:
            auc_pr = float(average_precision_score(all_labels[:, col], all_preds[:, col]))
        except ValueError:
            # Only input-validation failures fall back to 0.0; anything else
            # (including KeyboardInterrupt) now propagates instead of being hidden.
            auc_pr = 0.0
        per_label_auc_pr.append(auc_pr)

    return {
        'loss': loss_sum / n_samples,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
        'macro_auc_pr': float(np.mean(per_label_auc_pr)),
        'per_label_auc_pr': per_label_auc_pr
    }


In [None]:
# training loop (no val - train on full data)
# Create the checkpoint directory up front: discovering that it is missing only
# at torch.save() time would throw away the whole training run.
os.makedirs(os.path.dirname(MODEL_SAVE_PATH) or '.', exist_ok=True)

print("Starting training on full training data...\n")

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")

    model.train()
    train_loss = 0.0
    progress_bar = tqdm(train_loader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()

        # clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        # the LR schedule advances once per batch, matching total_steps
        scheduler.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        progress_bar.set_postfix({'loss': f"{batch_loss:.4f}"})

    avg_train_loss = train_loss / len(train_loader)
    print(f"Train Loss: {avg_train_loss:.4f}\n")

# save final model (weights only, via the state dict)
torch.save(model.state_dict(), MODEL_SAVE_PATH)
print(f"Training complete! Model saved to {MODEL_SAVE_PATH}")


In [None]:
# evaluate on test set
test_metrics = evaluate(model, test_loader, device)

# headline macro metrics, framed by a banner
print("=" * 50, "TEST SET RESULTS", "=" * 50, sep="\n")
print(f"Macro Precision: {test_metrics['macro_precision']:.3f}")
print(f"Macro Recall:    {test_metrics['macro_recall']:.3f}")
print(f"Macro F1:        {test_metrics['macro_f1']:.3f}")
print(f"Macro AUC-PR:    {test_metrics['macro_auc_pr']:.3f}")

# per-label breakdown, in LABEL_COLS order
print("\nPer-label AUC-PR:")
for label, label_auc_pr in zip(LABEL_COLS, test_metrics['per_label_auc_pr']):
    print(f"  {label:15s}: {label_auc_pr:.3f}")


In [None]:
# save results
# Metrics are cast to builtin floats so json can always serialise them, then
# rounded to three decimals.
results = {
    metric: round(float(test_metrics[metric]), 3)
    for metric in ('macro_precision', 'macro_recall', 'macro_f1', 'macro_auc_pr')
}
results['per_label_auc_pr'] = {
    label: round(float(score), 3)
    for label, score in zip(LABEL_COLS, test_metrics['per_label_auc_pr'])
}

# create the output directory if needed so open() cannot fail on a fresh checkout
os.makedirs(os.path.dirname(RESULTS_PATH) or '.', exist_ok=True)
with open(RESULTS_PATH, 'w') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to {RESULTS_PATH}")
