In [2]:
# Cell 1 ‚Äî installs (only if needed) and imports
!pip install -q transformers datasets lime tqdm

import os
import random
import math
import json
import time
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW


from lime.lime_text import LimeTextExplainer

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)


Device: cpu


In [3]:
# Cell 2 ‚Äî file paths & hyperparams (edit paths if required)
DATA_FILE = "cyberbullying_tweets.csv"   # your dataset from Kaggle (andrewmvd)
GLOVE_FILE = "glove.6B.300d.txt"         # make sure this file is in working directory

# GloVe/token settings to reproduce ~25,000 dims
EMBED_DIM = 300
MAX_TOKENS_FOR_25K = 83   # 83*300 = 24,900
PAD_TO_25K = 25000        # we will pad to exactly 25,000 dims

# PCA target per paper
PCA_N_COMPONENTS = 9000

# Transformer backbone
ROBERTA_MODEL_NAME = "roberta-base"

# Training hyperparams (paper used fairly large batches / longer training,
# but these are reasonable defaults; you can increase if you have GPU memory)
BATCH_SIZE = 32
NUM_EPOCHS = 6
LR = 2e-5
WEIGHT_DECAY = 1e-2
WARMUP_STEPS = 0

# Cross-validation folds
KFOLD = 5

# Files to save
PCA_MODEL_PATH = "ipca_9000.npy"   # we'll save sklearn PCA components and mean manually
PCA_MEAN_PATH = "ipca_mean.npy"
ROBERTA_FINE_TUNED_PATH = "best_robertanet.pt"


In [4]:
# Cell 3 ‚Äî load dataset and preprocess text
assert os.path.exists(DATA_FILE), f"Dataset not found: {DATA_FILE}"
df = pd.read_csv(DATA_FILE)

# Inspect and adapt to your CSV columns
# Many Kaggle versions use columns like 'tweet_text' and 'cyberbullying_type' or 'label'
# Adapt these names if your file differs.
if "tweet_text" in df.columns:
    TEXT_COL = "tweet_text"
elif "text" in df.columns:
    TEXT_COL = "text"
elif "tweet" in df.columns:
    TEXT_COL = "tweet"
else:
    # fallback: pick first object dtype column
    TEXT_COL = df.select_dtypes(include=['object']).columns[0]

if "cyberbullying_type" in df.columns:
    LABEL_COL = "cyberbullying_type"
elif "label" in df.columns:
    LABEL_COL = "label"
else:
    LABEL_COL = df.select_dtypes(include=['int','object']).columns[1]

print("Using text col:", TEXT_COL, "label col:", LABEL_COL)
df = df[[TEXT_COL, LABEL_COL]].dropna().reset_index(drop=True)

# Basic cleaning function (paper used basic cleaning)
import re
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|www\S+", "", t)
    t = re.sub(r"[^a-z0-9\s@#']", " ", t)  # keep hashtags/mentions if helpful
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["clean_text"] = df[TEXT_COL].astype(str).apply(clean_text)

# label encode
le = LabelEncoder()
df["label_enc"] = le.fit_transform(df[LABEL_COL].astype(str))
classes = list(le.classes_)
print("Classes:", classes)
print("Dataset size:", len(df))


Using text col: tweet_text label col: cyberbullying_type
Classes: ['age', 'ethnicity', 'gender', 'not_cyberbullying', 'other_cyberbullying', 'religion']
Dataset size: 47692


In [5]:
# Cell 4 ‚Äî load GloVe embeddings
assert os.path.exists(GLOVE_FILE), f"GloVe file not found: {GLOVE_FILE}"
embeddings_index = {}
with open(GLOVE_FILE, 'r', encoding='utf8') as f:
    for line in tqdm(f, desc="Loading GloVe"):
        parts = line.rstrip().split(" ")
        word = parts[0]
        vec = np.asarray(parts[1:], dtype=np.float32)
        embeddings_index[word] = vec
print("Loaded GloVe vectors:", len(embeddings_index))


Loading GloVe: 400000it [00:40, 9965.43it/s] 

Loaded GloVe vectors: 400000





In [6]:
# Cell 5 ‚Äî produce per-tweet GloVe vectors -> target 25,000 dims
MAX_TOKENS = MAX_TOKENS_FOR_25K  # = 83
EMBED_DIM = 300

def get_glove_flat_vector(text, max_tokens=MAX_TOKENS, pad_to=PAD_TO_25K):
    # take token-level glove vectors (first max_tokens tokens) and flatten
    words = text.split()[:max_tokens]
    vecs = []
    for w in words:
        v = embeddings_index.get(w)
        if v is None:
            # try simple normalization (strip punctuation)
            w2 = re.sub(r"[^a-z0-9]", "", w)
            v = embeddings_index.get(w2, np.zeros(EMBED_DIM, dtype=np.float32))
        vecs.append(v)
    # pad tokens if fewer than max_tokens
    while len(vecs) < max_tokens:
        vecs.append(np.zeros(EMBED_DIM, dtype=np.float32))
    flat = np.concatenate(vecs)  # shape = max_tokens * EMBED_DIM (24,900)
    # pad/truncate to exact 25,000 dims
    if flat.shape[0] < pad_to:
        pad_len = pad_to - flat.shape[0]
        flat = np.concatenate([flat, np.zeros(pad_len, dtype=np.float32)])
    elif flat.shape[0] > pad_to:
        flat = flat[:pad_to]
    return flat

# create X_glove (may take a bit of memory)
print("Generating 25,000-dim GloVe vectors for each tweet...")
X_glove = np.vstack([get_glove_flat_vector(t) for t in tqdm(df['clean_text'])])
y = df['label_enc'].values
print("X_glove shape:", X_glove.shape)


Generating 25,000-dim GloVe vectors for each tweet...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47692/47692 [00:12<00:00, 3839.45it/s]


X_glove shape: (47692, 25000)


In [9]:
# ‚úÖ Cell 6 ‚Äî Adaptive IncrementalPCA (auto RAM-safe)
import psutil
from sklearn.decomposition import PCA, IncrementalPCA
import gc

# --- Detect system memory ---
total_ram_gb = psutil.virtual_memory().total / (1024**3)
print(f"üß† Detected system RAM: {total_ram_gb:.2f} GB")

# --- Configuration ---
target_components = PCA_N_COMPONENTS  # 9000 per paper
n_samples, n_features = X_glove.shape

# Estimate memory for one full PCA matrix (float32)
approx_mem_gb = (n_samples * n_features * 4) / (1024**3)
print(f"Approx. dataset size in RAM: {approx_mem_gb:.2f} GB")

# --- Decide mode ---
if total_ram_gb > approx_mem_gb * 2.5:
    # plenty of RAM ‚Äî use full PCA
    print(f"‚úÖ Using full PCA with {target_components} components...")
    pca = PCA(n_components=target_components, random_state=42)
    X_pca = pca.fit_transform(X_glove)
    components, mean = pca.components_, pca.mean_
else:
    # limited RAM ‚Äî use IncrementalPCA safely
    print(f"‚öôÔ∏è Using IncrementalPCA (RAM-safe mode)...")

    # pick a batch size to be at least equal to n_components
    safe_batch = max(min(10000, n_samples), target_components)
    safe_batch = min(safe_batch, n_samples)
    n_comp_fit = min(target_components, safe_batch)

    print(f"‚Üí n_components_fit={n_comp_fit}, batch_size={safe_batch}")

    ipca = IncrementalPCA(n_components=n_comp_fit)
    for i in tqdm(range(0, n_samples, safe_batch), desc="Partial fitting PCA"):
        ipca.partial_fit(X_glove[i:i+safe_batch])

    # transform all data
    X_pca_parts = []
    for i in tqdm(range(0, n_samples, safe_batch), desc="Transforming batches"):
        X_pca_parts.append(ipca.transform(X_glove[i:i+safe_batch]))
        gc.collect()
    X_pca = np.vstack(X_pca_parts).astype(np.float32)
    del X_pca_parts
    gc.collect()

    components, mean = ipca.components_, ipca.mean_

# --- Save PCA results ---
np.save("ipca_components.npy", components)
np.save("ipca_mean.npy", mean)
print("‚úÖ PCA complete. X_pca shape:", X_pca.shape)


üß† Detected system RAM: 7.65 GB
Approx. dataset size in RAM: 4.44 GB
‚öôÔ∏è Using IncrementalPCA (RAM-safe mode)...
‚Üí n_components_fit=9000, batch_size=10000


Partial fitting PCA:  20%|‚ñà‚ñà        | 1/5 [11:24<45:39, 684.94s/it]


MemoryError: Unable to allocate 3.54 GiB for an array with shape (19001, 25000) and data type float64

In [None]:
# Cell 7 ‚Äî Tokenizer + RoBERTa backbone + classifier that concatenates PCA features
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL_NAME)
roberta = AutoModel.from_pretrained(ROBERTA_MODEL_NAME).to(DEVICE)
roberta.config.output_hidden_states = False

# Classifier that concatenates RoBERTa pooled output (768) with PCA features (9000)
class RobertaPCAClassifier(nn.Module):
    def __init__(self, pca_dim, hidden_dim=1024, num_classes=len(classes), roberta_model=roberta):
        super().__init__()
        self.roberta = roberta_model  # pretrained backbone
        # freeze/unfreeze policy: paper fine-tunes ‚Äî we will fine-tune full model
        # If memory/time limited, you can freeze `self.roberta` layers and only train head.
        self.pca_proj = nn.Linear(pca_dim, hidden_dim)  # project PCA features
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )
    def forward(self, input_ids=None, attention_mask=None, pca_feats=None):
        # roberta forward
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # pooled representation: use mean pooling of last hidden state across tokens
        last_hidden = outputs.last_hidden_state  # (B, T, H)
        pooled = last_hidden.mean(dim=1)  # (B, H)
        pca_proj = F.relu(self.pca_proj(pca_feats))  # (B, hidden_dim)
        fused = torch.cat([pooled, pca_proj], dim=1)
        logits = self.classifier(fused)
        return logits

# initialize model
pca_dim = PCA_N_COMPONENTS
model = RobertaPCAClassifier(pca_dim=pca_dim).to(DEVICE)
print(model)


In [None]:
# ‚úÖ Cell 7A ‚Äî Machine Learning baselines on PCA features (RF, SVM, NB, KNN)

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Training classical ML models on PCA (9,000) features...")

# scale features for distance-based models
scaler = StandardScaler()
X_pca_scaled = scaler.fit_transform(X_pca)

# 80/20 split for baseline testing
from sklearn.model_selection import train_test_split
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(
    X_pca_scaled, y, test_size=0.2, random_state=42, stratify=y
)

ml_models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "LinearSVM": LinearSVC(C=1.0, max_iter=5000, random_state=42),
    "NaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

ml_results = []
for name, clf in ml_models.items():
    print(f"‚ñ∂ Training {name} ...")
    clf.fit(X_train_ml, y_train_ml)
    preds = clf.predict(X_test_ml)
    acc = accuracy_score(y_test_ml, preds)
    prec = precision_score(y_test_ml, preds, average='macro', zero_division=0)
    rec = recall_score(y_test_ml, preds, average='macro', zero_division=0)
    f1 = f1_score(y_test_ml, preds, average='macro', zero_division=0)
    ml_results.append([name, acc, prec, rec, f1])
    print(f"{name}: Acc={acc:.4f} | F1={f1:.4f}")

ml_df = pd.DataFrame(ml_results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1'])
display(ml_df)


In [None]:
# ‚úÖ Cell 7B ‚Äî Deep Learning baselines (CNN, BiLSTM, ConvLSTM)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# reshape GloVe features to (samples, seq_len, embed_dim) = (N, 30, 300)
SEQ_LEN, EMBED_DIM = 30, 300
X_glove_reshaped = X_glove[:, :SEQ_LEN*EMBED_DIM].reshape(-1, SEQ_LEN, EMBED_DIM)

X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    X_glove_reshaped, y, test_size=0.2, random_state=42, stratify=y
)

train_ds = TensorDataset(torch.tensor(X_train_dl, dtype=torch.float32), torch.tensor(y_train_dl))
test_ds  = TensorDataset(torch.tensor(X_test_dl,  dtype=torch.float32), torch.tensor(y_test_dl))

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=64, shuffle=False)

# Simple CNN baseline
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=len(np.unique(y))):
        super().__init__()
        self.conv1 = nn.Conv1d(EMBED_DIM, 128, 3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(128, num_classes)
    def forward(self, x):
        x = x.permute(0,2,1)  # (B, 300, 30)
        x = F.relu(self.conv1(x))
        x = self.pool(x).squeeze(-1)
        return self.fc(x)

# BiLSTM baseline
class SimpleBiLSTM(nn.Module):
    def __init__(self, hidden=128, num_classes=len(np.unique(y))):
        super().__init__()
        self.lstm = nn.LSTM(EMBED_DIM, hidden, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden*2, num_classes)
    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:,-1,:])

# ConvLSTM hybrid baseline
class ConvLSTM(nn.Module):
    def __init__(self, hidden=128, num_classes=len(np.unique(y))):
        super().__init__()
        self.conv = nn.Conv1d(EMBED_DIM, 128, 3, padding=1)
        self.lstm = nn.LSTM(128, hidden, batch_first=True)
        self.fc = nn.Linear(hidden, num_classes)
    def forward(self, x):
        x = x.permute(0,2,1)
        x = F.relu(self.conv(x)).permute(0,2,1)
        out, _ = self.lstm(x)
        return self.fc(out[:,-1,:])

def train_and_eval(model, loader_train, loader_test, epochs=5):
    model = model.to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()
    for ep in range(epochs):
        model.train()
        for xb, yb in loader_train:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            opt.step()
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in loader_test:
            out = model(xb.to(DEVICE))
            preds += torch.argmax(out, dim=1).cpu().tolist()
            trues += yb.tolist()
    return (
        accuracy_score(trues, preds),
        precision_score(trues, preds, average='macro', zero_division=0),
        recall_score(trues, preds, average='macro', zero_division=0),
        f1_score(trues, preds, average='macro', zero_division=0),
    )

# Train each DL model
cnn_metrics = train_and_eval(SimpleCNN(), train_loader, test_loader)
bilstm_metrics = train_and_eval(SimpleBiLSTM(), train_loader, test_loader)
convlstm_metrics = train_and_eval(ConvLSTM(), train_loader, test_loader)

dl_df = pd.DataFrame([
    ['CNN', *cnn_metrics],
    ['BiLSTM', *bilstm_metrics],
    ['ConvLSTM', *convlstm_metrics]
], columns=['Model','Accuracy','Precision','Recall','F1'])

print("‚úÖ Deep Learning baselines complete:")
display(dl_df)


In [None]:
# Cell 7C ‚Äî BERT baseline (fine-tune bert-base-uncased, 5-fold CV)
from transformers import AutoTokenizer, AutoModel

BERT_MODEL_NAME = "bert-base-uncased"
print("Preparing BERT baseline:", BERT_MODEL_NAME)

tokenizer_bert = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
bert_backbone = AutoModel.from_pretrained(BERT_MODEL_NAME).to(DEVICE)

class BertClassifier(nn.Module):
    def __init__(self, num_classes=len(classes), backbone=bert_backbone, hidden_dim=256):
        super().__init__()
        self.backbone = backbone
        self.fc = nn.Sequential(
            nn.Linear(self.backbone.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )
    def forward(self, input_ids=None, attention_mask=None, pca_feats=None):
        # ignore pca_feats (BERT baseline uses text only)
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        pooled = outputs.last_hidden_state.mean(dim=1)
        logits = self.fc(pooled)
        return logits

# 5-fold CV for BERT (text-only)
skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
bert_fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df['clean_text'], y), 1):
    print(f"\n>>> BERT Fold {fold}/{KFOLD}")
    train_texts = df.loc[train_idx, 'clean_text'].tolist()
    val_texts = df.loc[val_idx, 'clean_text'].tolist()
    y_train = y[train_idx]
    y_val = y[val_idx]
    # datasets & loaders
    train_ds = RobertaPCADataset(train_texts, np.zeros((len(train_texts), pca_dim)), y_train, tokenizer_bert)  # pca_feats ignored
    val_ds   = RobertaPCADataset(val_texts,   np.zeros((len(val_texts),   pca_dim)), y_val,   tokenizer_bert)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    model_bert = BertClassifier(num_classes=len(classes)).to(DEVICE)
    optimizer = AdamW(model_bert.parameters(), lr=2e-5, weight_decay=1e-2)
    total_steps = len(train_loader)*NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

    best_val_f1 = 0.0
    for epoch in range(1, NUM_EPOCHS+1):
        _ = train_one_epoch(model_bert, train_loader, optimizer, scheduler)
        stats = evaluate_model(model_bert, val_loader)
        print(f"BERT Fold{fold} E{epoch} ‚Äî Val F1: {stats['f1']:.4f} Acc: {stats['accuracy']:.4f}")
        if stats['f1'] > best_val_f1:
            best_val_f1 = stats['f1']
            torch.save(model_bert.state_dict(), f"best_bert_fold{fold}.pt")
    model_bert.load_state_dict(torch.load(f"best_bert_fold{fold}.pt"))
    final_stats = evaluate_model(model_bert, val_loader)
    print("BERT Fold final:", final_stats)
    bert_fold_metrics.append(final_stats)

# Aggregate BERT results
bert_acc = np.mean([m['accuracy'] for m in bert_fold_metrics])
bert_prec = np.mean([m['precision'] for m in bert_fold_metrics])
bert_rec = np.mean([m['recall'] for m in bert_fold_metrics])
bert_f1 = np.mean([m['f1'] for m in bert_fold_metrics])
print("\nBERT 5-fold mean ‚Äî Acc: {:.4f} Prec: {:.4f} Rec: {:.4f} F1: {:.4f}".format(bert_acc, bert_prec, bert_rec, bert_f1))


In [None]:
# Cell 7D ‚Äî RoBERTa-only baseline (no PCA fusion), 5-fold CV
print("Preparing RoBERTa-only baseline (text-only)")

tokenizer_roberta_text = AutoTokenizer.from_pretrained(ROBERTA_MODEL_NAME)
roberta_backbone_text = AutoModel.from_pretrained(ROBERTA_MODEL_NAME).to(DEVICE)

class RobertaTextClassifier(nn.Module):
    def __init__(self, num_classes=len(classes), backbone=roberta_backbone_text, hidden_dim=256):
        super().__init__()
        self.backbone = backbone
        self.fc = nn.Sequential(
            nn.Linear(self.backbone.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )
    def forward(self, input_ids=None, attention_mask=None, pca_feats=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        pooled = outputs.last_hidden_state.mean(dim=1)
        logits = self.fc(pooled)
        return logits

roberta_text_fold_metrics = []
for fold, (train_idx, val_idx) in enumerate(skf.split(df['clean_text'], y), 1):
    print(f"\n>>> RoBERTa-text Fold {fold}/{KFOLD}")
    train_texts = df.loc[train_idx, 'clean_text'].tolist()
    val_texts = df.loc[val_idx, 'clean_text'].tolist()
    y_train = y[train_idx]
    y_val = y[val_idx]

    train_ds = RobertaPCADataset(train_texts, np.zeros((len(train_texts), pca_dim)), y_train, tokenizer_roberta_text)
    val_ds   = RobertaPCADataset(val_texts,   np.zeros((len(val_texts),   pca_dim)), y_val,   tokenizer_roberta_text)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    model_rb_text = RobertaTextClassifier(num_classes=len(classes)).to(DEVICE)
    optimizer = AdamW(model_rb_text.parameters(), lr=2e-5, weight_decay=1e-2)
    total_steps = len(train_loader)*NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

    best_val_f1 = 0.0
    for epoch in range(1, NUM_EPOCHS+1):
        _ = train_one_epoch(model_rb_text, train_loader, optimizer, scheduler)
        stats = evaluate_model(model_rb_text, val_loader)
        print(f"RoBERTa-text Fold{fold} E{epoch} ‚Äî Val F1: {stats['f1']:.4f} Acc: {stats['accuracy']:.4f}")
        if stats['f1'] > best_val_f1:
            best_val_f1 = stats['f1']
            torch.save(model_rb_text.state_dict(), f"best_roberta_text_fold{fold}.pt")
    model_rb_text.load_state_dict(torch.load(f"best_roberta_text_fold{fold}.pt"))
    final_stats = evaluate_model(model_rb_text, val_loader)
    print("RoBERTa-text Fold final:", final_stats)
    roberta_text_fold_metrics.append(final_stats)

# Aggregate RoBERTa-only results
roberta_text_acc = np.mean([m['accuracy'] for m in roberta_text_fold_metrics])
roberta_text_prec = np.mean([m['precision'] for m in roberta_text_fold_metrics])
roberta_text_rec = np.mean([m['recall'] for m in roberta_text_fold_metrics])
roberta_text_f1 = np.mean([m['f1'] for m in roberta_text_fold_metrics])
print("\nRoBERTa-text 5-fold mean ‚Äî Acc: {:.4f} Prec: {:.4f} Rec: {:.4f} F1: {:.4f}".format(
    roberta_text_acc, roberta_text_prec, roberta_text_rec, roberta_text_f1))


In [None]:
# Cell 8 ‚Äî dataset + dataloader helpers
class RobertaPCADataset(Dataset):
    def __init__(self, texts, pca_feats, labels, tokenizer, max_length=128):
        self.texts = texts
        self.pca_feats = pca_feats.astype(np.float32)
        self.labels = labels.astype(np.int64)
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        enc = self.tokenizer(text,
                              truncation=True,
                              max_length=self.max_length,
                              padding='max_length',
                              return_tensors='pt')
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'pca_feats': torch.from_numpy(self.pca_feats[idx]),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item

def collate_fn(batch):
    input_ids = torch.stack([b['input_ids'] for b in batch])
    attention_mask = torch.stack([b['attention_mask'] for b in batch])
    pca_feats = torch.stack([b['pca_feats'] for b in batch])
    labels = torch.stack([b['labels'] for b in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'pca_feats': pca_feats, 'labels': labels}


In [None]:
# Cell 9 ‚Äî training & evaluation functions
from tqdm.auto import tqdm

loss_fn = nn.CrossEntropyLoss()

def train_one_epoch(model, dataloader, optimizer, scheduler=None, clip_grad=1.0):
    model.train()
    total_loss = 0.0
    pbar = tqdm(dataloader, desc="Train batch")
    for batch in pbar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        pca_feats = batch['pca_feats'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        logits = model(input_ids=input_ids, attention_mask=attention_mask, pca_feats=pca_feats)
        loss = loss_fn(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        total_loss += loss.item() * input_ids.size(0)
    return total_loss / len(dataloader.dataset)

def evaluate_model(model, dataloader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            pca_feats = batch['pca_feats'].to(DEVICE)
            labels = batch['labels'].cpu().numpy()

            logits = model(input_ids=input_ids, attention_mask=attention_mask, pca_feats=pca_feats)
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
            preds.extend(batch_preds.tolist())
            trues.extend(labels.tolist())
    acc = accuracy_score(trues, preds)
    prec = precision_score(trues, preds, average='macro', zero_division=0)
    rec = recall_score(trues, preds, average='macro', zero_division=0)
    f1 = f1_score(trues, preds, average='macro', zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "preds": preds, "trues": trues}


In [None]:
# Cell 10 ‚Äî 5-fold cross validation running fine-tuning and reporting
skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)

fold_metrics = []
for fold, (train_idx, val_idx) in enumerate(skf.split(df['clean_text'], y), 1):
    print(f"\n===== Fold {fold}/{KFOLD} =====")

    # create datasets
    train_texts = df.loc[train_idx, 'clean_text'].tolist()
    val_texts = df.loc[val_idx, 'clean_text'].tolist()
    X_train_pca = X_pca[train_idx]
    X_val_pca = X_pca[val_idx]
    y_train = y[train_idx]
    y_val = y[val_idx]

    train_ds = RobertaPCADataset(train_texts, X_train_pca, y_train, tokenizer)
    val_ds = RobertaPCADataset(val_texts, X_val_pca, y_val, tokenizer)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    # initialize a new model instance for each fold (fresh weights)
    model = RobertaPCAClassifier(pca_dim=pca_dim).to(DEVICE)

    # optimizer & scheduler (fine-tune whole model)
    optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    total_steps = len(train_loader) * NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

    best_val_f1 = 0.0
    for epoch in range(1, NUM_EPOCHS + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, scheduler)
        val_stats = evaluate_model(model, val_loader)
        print(f"Fold {fold} Epoch {epoch}/{NUM_EPOCHS} ‚Äî TrainLoss: {train_loss:.4f} Val F1: {val_stats['f1']:.4f} Val Acc: {val_stats['accuracy']:.4f}")
        # save best model for fold
        if val_stats['f1'] > best_val_f1:
            best_val_f1 = val_stats['f1']
            torch.save(model.state_dict(), f"best_robertanet_fold{fold}.pt")
    # load best and evaluate (on validation)
    model.load_state_dict(torch.load(f"best_robertanet_fold{fold}.pt"))
    val_stats_final = evaluate_model(model, val_loader)
    print("Fold final metrics:", val_stats_final)
    fold_metrics.append(val_stats_final)

# aggregate and print mean ¬± std
accs = [m['accuracy'] for m in fold_metrics]
precs = [m['precision'] for m in fold_metrics]
recs = [m['recall'] for m in fold_metrics]
f1s = [m['f1'] for m in fold_metrics]
print("\n=== 5-Fold CV Results ===")
print(f"Accuracy : {np.mean(accs):.4f} ¬± {np.std(accs):.4f}")
print(f"Precision: {np.mean(precs):.4f} ¬± {np.std(precs):.4f}")
print(f"Recall   : {np.mean(recs):.4f} ¬± {np.std(recs):.4f}")
print(f"F1-score : {np.mean(f1s):.4f} ¬± {np.std(f1s):.4f}")


In [None]:
# Cell 11 ‚Äî if you have separate test set, evaluate final saved best model, otherwise
# train on full dataset and save final model (paper reports final test; here we show training on full data)
full_ds = RobertaPCADataset(df['clean_text'].tolist(), X_pca, y, tokenizer)
full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Train final model on full dataset (if you want) ‚Äî careful, this is optional and expensive
final_model = RobertaPCAClassifier(pca_dim=pca_dim).to(DEVICE)
optimizer = AdamW(final_model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=len(full_loader)*NUM_EPOCHS)

best_f1 = 0.0
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_one_epoch(final_model, full_loader, optimizer, scheduler)
    # no validation here unless you have a held-out test set
    print(f"Epoch {epoch}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f}")
    # optionally save each epoch
    if (epoch % 2) == 0:
        torch.save(final_model.state_dict(), f"final_robertanet_epoch{epoch}.pt")

# Save final model
torch.save(final_model.state_dict(), ROBERTA_FINE_TUNED_PATH)
print("Final model saved to", ROBERTA_FINE_TUNED_PATH)


In [None]:
# Cell 12 ‚Äî LIME: define predict_proba wrapper that accepts raw texts and returns class probabilities
# Load PCA components & mean (we saved earlier)
ipca_components = np.load("ipca_components.npy")
ipca_mean = np.load("ipca_mean.npy")

# recreate IncrementalPCA object for transform (we'll use sklearn PCA transform formula)
def glove_text_to_pca_vector(text):
    # replicate get_glove_flat_vector then center and project: x_pca = (x - mean) dot components_.T
    xflat = get_glove_flat_vector(clean_text(text))
    x_centered = xflat - ipca_mean
    # components shape (n_components, n_features) => project:
    x_pca = np.dot(x_centered, ipca_components.T)  # shape (n_components,)
    return x_pca.astype(np.float32)

# load the final model (if not loaded)
model = RobertaPCAClassifier(pca_dim=pca_dim).to(DEVICE)
if os.path.exists(ROBERTA_FINE_TUNED_PATH):
    model.load_state_dict(torch.load(ROBERTA_FINE_TUNED_PATH, map_location=DEVICE))
model.eval()

def predict_proba_from_texts(texts):
    # texts: list of strings -> returns array (n_texts, n_classes) of probabilities
    batch_inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = batch_inputs['input_ids'].to(DEVICE)
    attention_mask = batch_inputs['attention_mask'].to(DEVICE)
    # compute PCA features for each text (on CPU then move to device)
    pca_feats = np.vstack([glove_text_to_pca_vector(t) for t in texts])
    pca_feats = torch.from_numpy(pca_feats).to(DEVICE)
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask, pca_feats=pca_feats)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
    return probs

# LIME explainer
explainer = LimeTextExplainer(class_names=classes)


In [None]:
# Cell 13 ‚Äî generate LIME explanation for sample tweet(s)
examples = df['clean_text'].sample(3, random_state=SEED).tolist()

for i, text in enumerate(examples):
    print(f"\n--- Example {i+1} ---")
    print(text)
    probs = predict_proba_from_texts([text])[0]
    pred_label = classes[np.argmax(probs)]
    print("Predicted:", pred_label, "Probs:", np.round(probs, 3))

    exp = explainer.explain_instance(text, predict_proba_from_texts, num_features=10)
    print("LIME explanation (word, weight):")
    print(exp.as_list())
    # optional: visual
    fig = exp.as_pyplot_figure()
    fig.suptitle(f"LIME ‚Äî predicted: {pred_label}")


In [None]:
# Cell 14 (UPDATED) ‚Äî Aggregate ML, DL, BERT, RoBERTa (text), and Proposed RoBERTa+PCA results

# Prepare transformer rows
try:
    bert_row = ['BERT (text-only)', bert_acc, bert_prec, bert_rec, bert_f1]
except NameError:
    bert_row = ['BERT (text-only)', 0,0,0,0]

try:
    roberta_row = ['RoBERTa (text-only)', roberta_text_acc, roberta_text_prec, roberta_text_rec, roberta_text_f1]
except NameError:
    roberta_row = ['RoBERTa (text-only)', 0,0,0,0]

try:
    roberta_pca_row = ['RoBERTa + PCA-GloVe', np.mean([m['accuracy'] for m in fold_metrics]),
                       np.mean([m['precision'] for m in fold_metrics]),
                       np.mean([m['recall'] for m in fold_metrics]),
                       np.mean([m['f1'] for m in fold_metrics])]
except Exception:
    roberta_pca_row = ['RoBERTa + PCA-GloVe', 0,0,0,0]

transformer_df = pd.DataFrame([bert_row, roberta_row, roberta_pca_row],
                              columns=['Model','Accuracy','Precision','Recall','F1'])

# Combine all
all_results = pd.concat([ml_df, dl_df, transformer_df], ignore_index=True)
all_results = all_results.sort_values(by='F1', ascending=False).reset_index(drop=True)
print("üìä Final Performance Comparison (Paper-style Table):")
display(all_results.style.set_properties(**{'text-align': 'center'}))
