In [1]:
!pip install --quiet torch scikit-learn pandas numpy nltk lime tqdm


In [2]:
import os
import re
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from lime.lime_text import LimeTextExplainer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish'\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# File + column names (you confirmed these)
DATA_FILE = "cyberbullying_tweets.csv"
TEXT_COL = "tweet_text"
LABEL_COL = "cyberbullying_type"

# GloVe file path (download and extract glove.6B.300d.txt)
GLOVE_FILE = "glove.6B.300d.txt"

# PCA target (paper uses 9000). If your machine can't handle it, change to lower (e.g., 300, 1000).
PCA_N_COMPONENTS = 9000

# Training hyperparameters
BATCH_SIZE = 64
NUM_EPOCHS = 10
LR = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device:", DEVICE)


Device: cpu


In [4]:
assert os.path.exists(DATA_FILE), f"Dataset file not found: {DATA_FILE}"
df = pd.read_csv(DATA_FILE)
print("Loaded dataset:", df.shape)
if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
    raise KeyError(f"Expected columns '{TEXT_COL}' and '{LABEL_COL}' in {DATA_FILE}. Found: {list(df.columns)}")
df = df[[TEXT_COL, LABEL_COL]].dropna().reset_index(drop=True)
print("After dropping NA:", df.shape)


Loaded dataset: (47692, 2)
After dropping NA: (47692, 2)


In [5]:
stop = set(stopwords.words('english'))
def clean_text(t):
    t = str(t)
    t = re.sub(r"http\S+|www\.\S+", "", t)
    t = re.sub(r"@\w+|#\w+", "", t)
    t = re.sub(r"[^a-zA-Z\s]", " ", t)
    t = t.lower()
    t = " ".join([w for w in t.split() if w and w not in stop])
    return t

df['clean_text'] = df[TEXT_COL].apply(clean_text)
print("Sample cleaned text:")
print(df['clean_text'].head(3).to_list())


Sample cleaned text:
['words food crapilicious', 'white', 'classy whore red velvet cupcakes']


In [6]:
assert os.path.exists(GLOVE_FILE), f"GloVe file not found: {GLOVE_FILE}. Download and extract glove.6B.300d.txt."

embeddings_index = {}
with open(GLOVE_FILE, 'r', encoding='utf8') as f:
    for line in tqdm(f, desc="Loading GloVe"):
        parts = line.rstrip().split(" ")
        word = parts[0]
        vec = np.asarray(parts[1:], dtype=np.float32)
        embeddings_index[word] = vec
print("Loaded GloVe vectors:", len(embeddings_index))


Loading GloVe: 0it [00:00, ?it/s]

Loading GloVe: 400000it [00:41, 9630.14it/s] 

Loaded GloVe vectors: 400000





In [7]:
def get_glove_vector(text):
    words = text.split()
    if not words:
        return np.zeros(300, dtype=np.float32)
    vecs = [embeddings_index.get(w) for w in words]
    # replace missing with zeros
    vecs = [v if v is not None else np.zeros(300, dtype=np.float32) for v in vecs]
    return np.mean(vecs, axis=0)

# generate features (this may take a moment)
X_glove = np.vstack([get_glove_vector(t) for t in tqdm(df['clean_text'], desc="GloVe transform")])
print("X_glove shape:", X_glove.shape)


GloVe transform:   0%|          | 0/47692 [00:00<?, ?it/s]

GloVe transform: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47692/47692 [00:03<00:00, 14220.01it/s]

X_glove shape: (47692, 300)





In [8]:
# Try to run PCA with the paper's component count; fallback if memory error
try:
    print(f"Attempting PCA with n_components={PCA_N_COMPONENTS} ...")
    pca = PCA(n_components=PCA_N_COMPONENTS, random_state=42)
    X_pca = pca.fit_transform(X_glove)
    print("PCA done. Shape:", X_pca.shape)
except Exception as e:
    print("PCA with requested components failed:", str(e))
    # fallback to smaller PCA (300)
    fallback = 300
    print(f"Falling back to PCA n_components={fallback}")
    pca = PCA(n_components=fallback, random_state=42)
    X_pca = pca.fit_transform(X_glove)
    print("PCA done (fallback). Shape:", X_pca.shape)

# Keep for later
print("Explained variance ratio sum:", X_pca.shape, np.sum(pca.explained_variance_ratio_))


Attempting PCA with n_components=9000 ...
PCA with requested components failed: n_components=9000 must be between 0 and min(n_samples, n_features)=300 with svd_solver='covariance_eigh'
Falling back to PCA n_components=300
PCA done (fallback). Shape: (47692, 300)
Explained variance ratio sum: (47692, 300) 0.99999994
PCA done (fallback). Shape: (47692, 300)
Explained variance ratio sum: (47692, 300) 0.99999994


In [9]:
# encode labels
le = LabelEncoder()
y = le.fit_transform(df[LABEL_COL].values)
classes = le.classes_
print("Classes:", classes, "n_classes:", len(classes))

# train/test split (80/20)
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y)

print("Train/Test shapes:", X_train_np.shape, X_test_np.shape)
# convert to torch tensors for DL models
X_train = torch.tensor(X_train_np, dtype=torch.float32)
X_test  = torch.tensor(X_test_np, dtype=torch.float32)
y_train = torch.tensor(y_train_np, dtype=torch.long)
y_test  = torch.tensor(y_test_np, dtype=torch.long)


Classes: ['age' 'ethnicity' 'gender' 'not_cyberbullying' 'other_cyberbullying'
 'religion'] n_classes: 6
Train/Test shapes: (38153, 300) (9539, 300)


In [10]:
import torch
import torch.nn as nn

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class RoBERTaNetLike(nn.Module):
    def __init__(self, input_dim, num_classes, n_heads=8, ff_dim=512, dropout=0.3):
        super().__init__()
        
        # Project input to a dimension divisible by n_heads
        self.embed_dim = n_heads * ((input_dim + n_heads - 1) // n_heads)  # next multiple of n_heads
        self.input_proj = nn.Linear(input_dim, self.embed_dim)
        
        self.mha = nn.MultiheadAttention(embed_dim=self.embed_dim, num_heads=n_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(self.embed_dim)
        
        self.ff = nn.Sequential(
            nn.Linear(self.embed_dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, self.embed_dim)
        )
        self.norm2 = nn.LayerNorm(self.embed_dim)
        
        self.classifier = nn.Sequential(
            nn.Linear(self.embed_dim, ff_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim//2, num_classes)
        )
    
    def forward(self, x):
        # x: (batch, input_dim)
        x = self.input_proj(x)  # project to embed_dim
        x_seq = x.unsqueeze(1)  # (batch, seq_len=1, embed_dim)
        attn_out, _ = self.mha(x_seq, x_seq, x_seq)
        attn_out = attn_out.squeeze(1)
        x = self.norm1(x + attn_out)
        ff_out = self.ff(x)
        x = self.norm2(x + ff_out)
        logits = self.classifier(x)
        return logits

# Initialize model with actual dimensions from our data
num_classes = len(le.classes_)
input_dim = X_train.shape[1]  # this will be our PCA dimension
print(f"Creating model with input_dim={input_dim}, num_classes={num_classes}")

model = RoBERTaNetLike(input_dim=input_dim, num_classes=num_classes).to(DEVICE)
print(model)

# Quick forward pass test with a small batch from our data
X_batch = X_train[:2].to(DEVICE)  # just use 2 examples to verify
logits = model(X_batch)
print("Logits shape:", logits.shape)  # should be (batch_size, num_classes)")

Creating model with input_dim=300, num_classes=6
RoBERTaNetLike(
  (input_proj): Linear(in_features=300, out_features=304, bias=True)
  (mha): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=304, out_features=304, bias=True)
  )
  (norm1): LayerNorm((304,), eps=1e-05, elementwise_affine=True)
  (ff): Sequential(
    (0): Linear(in_features=304, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=304, bias=True)
  )
  (norm2): LayerNorm((304,), eps=1e-05, elementwise_affine=True)
  (classifier): Sequential(
    (0): Linear(in_features=304, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=6, bias=True)
  )
)
Logits shape: torch.Size([2, 6])
Logits shape: torch.Size([2, 6])


In [11]:
# # Training setup
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
# num_epochs = 10
# batch_size = 64

# train_dataset = torch.utils.data.TensorDataset(
#     torch.FloatTensor(X_train),
#     torch.LongTensor(y_train)
# )
# train_loader = torch.utils.data.DataLoader(
#     train_dataset, 
#     batch_size=batch_size,
#     shuffle=True
# )

# val_dataset = torch.utils.data.TensorDataset(
#     torch.FloatTensor(X_test),
#     torch.LongTensor(y_test)
# )
# val_loader = torch.utils.data.DataLoader(
#     val_dataset,
#     batch_size=batch_size,
#     shuffle=False
# )

# # Training loop
# best_val_acc = 0
# for epoch in range(num_epochs):
#     # Training phase
#     model.train()
#     total_loss = 0
#     correct = 0
#     total = 0
    
#     for batch_X, batch_y in train_loader:
#         batch_X, batch_y = batch_X.to(DEVICE), batch_y.to(DEVICE)
        
#         optimizer.zero_grad()
#         outputs = model(batch_X)
#         loss = criterion(outputs, batch_y)
        
#         loss.backward()
#         optimizer.step()
        
#         total_loss += loss.item()
#         _, predicted = outputs.max(1)
#         total += batch_y.size(0)
#         correct += predicted.eq(batch_y).sum().item()
    
#     train_acc = 100. * correct / total
#     avg_loss = total_loss / len(train_loader)
    
#     # Validation phase
#     model.eval()
#     correct = 0
#     total = 0
#     val_loss = 0
    
#     with torch.no_grad():
#         for batch_X, batch_y in val_loader:
#             batch_X, batch_y = batch_X.to(DEVICE), batch_y.to(DEVICE)
#             outputs = model(batch_X)
#             loss = criterion(outputs, batch_y)
            
#             val_loss += loss.item()
#             _, predicted = outputs.max(1)
#             total += batch_y.size(0)
#             correct += predicted.eq(batch_y).sum().item()
    
#     val_acc = 100. * correct / total
#     avg_val_loss = val_loss / len(val_loader)
    
#     print(f'Epoch {epoch+1}/{num_epochs}:')
#     print(f'Training - Loss: {avg_loss:.4f}, Accuracy: {train_acc:.2f}%')
#     print(f'Validation - Loss: {avg_val_loss:.4f}, Accuracy: {val_acc:.2f}%')
#     print('-' * 60)
    
#     # Save best model
#     if val_acc > best_val_acc:
#         best_val_acc = val_acc
#         torch.save(model.state_dict(), 'best_model.pth')
#         print(f'Saved new best model with validation accuracy: {val_acc:.2f}%')

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# ----------------------------------------------------------
# ‚úÖ Fixed RoBERTaNet-like model (auto adapts to PCA dimension)
# ----------------------------------------------------------
class RoBERTaNetLike(nn.Module):
    def __init__(self, input_dim, num_classes, n_heads=4, ff_dim=None, dropout=0.3):
        super().__init__()
        if ff_dim is None:
            ff_dim = input_dim * 2  # auto scale based on PCA dim

        # Multi-head attention block
        self.mha = nn.MultiheadAttention(embed_dim=input_dim, num_heads=n_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(input_dim)

        # Feed-forward block
        self.ff = nn.Sequential(
            nn.Linear(input_dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, input_dim)
        )
        self.norm2 = nn.LayerNorm(input_dim)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, input_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(input_dim // 2, num_classes)
        )

    def forward(self, x):
        # x shape: (batch, features)
        x_seq = x.unsqueeze(1)  # add fake sequence dim (batch, seq=1, features)
        attn_out, _ = self.mha(x_seq, x_seq, x_seq)
        attn_out = attn_out.squeeze(1)
        x = self.norm1(x + attn_out)

        ff_out = self.ff(x)
        x = self.norm2(x + ff_out)

        logits = self.classifier(x)
        return logits


# ----------------------------------------------------------
# ‚úÖ Initialize model using your PCA dimension
# ----------------------------------------------------------
num_classes = len(np.unique(y))
input_dim = X_train.shape[1]
model = RoBERTaNetLike(input_dim=input_dim, num_classes=num_classes).to(DEVICE)

print(f"‚úÖ Model initialized with input_dim={input_dim}, num_classes={num_classes}")
print(model)

# ----------------------------------------------------------
# ‚úÖ Create DataLoaders
# ----------------------------------------------------------
train_dataset = TensorDataset(X_train, y_train)
test_dataset  = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# ----------------------------------------------------------
# ‚úÖ Loss & Optimizer
# ----------------------------------------------------------
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# ----------------------------------------------------------
# ‚úÖ Training & Evaluation Functions
# ----------------------------------------------------------
def train_one_epoch(model, loader):
    model.train()
    total_loss = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)


def evaluate(model, loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(DEVICE)
            out = model(xb)
            preds.extend(torch.argmax(out, dim=1).cpu().numpy())
            trues.extend(yb.numpy())
    acc = accuracy_score(trues, preds)
    prec = precision_score(trues, preds, average='macro', zero_division=0)
    rec = recall_score(trues, preds, average='macro', zero_division=0)
    f1 = f1_score(trues, preds, average='macro', zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}


# ----------------------------------------------------------
# ‚úÖ Training Loop
# ----------------------------------------------------------
best_val_f1 = 0.0
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_one_epoch(model, train_loader)
    metrics = evaluate(model, test_loader)
    print(f"Epoch {epoch}/{NUM_EPOCHS} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Acc: {metrics['accuracy']:.4f} | "
          f"Val F1: {metrics['f1']:.4f}")
    if metrics["f1"] > best_val_f1:
        best_val_f1 = metrics["f1"]
        torch.save(model.state_dict(), "best_robertanet.pth")

print("üèÅ Training complete.")
print("üî• Best validation F1-score:", best_val_f1)


Epoch 1/10 - train_loss: 0.6589 - val_acc: 0.7951 val_f1: 0.7940
Epoch 2/10 - train_loss: 0.4775 - val_acc: 0.8093 val_f1: 0.8069
Epoch 2/10 - train_loss: 0.4775 - val_acc: 0.8093 val_f1: 0.8069
Epoch 3/10 - train_loss: 0.4468 - val_acc: 0.8104 val_f1: 0.8112
Epoch 3/10 - train_loss: 0.4468 - val_acc: 0.8104 val_f1: 0.8112
Epoch 4/10 - train_loss: 0.4236 - val_acc: 0.8149 val_f1: 0.8149
Epoch 4/10 - train_loss: 0.4236 - val_acc: 0.8149 val_f1: 0.8149
Epoch 5/10 - train_loss: 0.4054 - val_acc: 0.8128 val_f1: 0.8109
Epoch 5/10 - train_loss: 0.4054 - val_acc: 0.8128 val_f1: 0.8109
Epoch 6/10 - train_loss: 0.3898 - val_acc: 0.8135 val_f1: 0.8134
Epoch 6/10 - train_loss: 0.3898 - val_acc: 0.8135 val_f1: 0.8134
Epoch 7/10 - train_loss: 0.3779 - val_acc: 0.8153 val_f1: 0.8159
Epoch 7/10 - train_loss: 0.3779 - val_acc: 0.8153 val_f1: 0.8159
Epoch 8/10 - train_loss: 0.3614 - val_acc: 0.8159 val_f1: 0.8139
Epoch 8/10 - train_loss: 0.3614 - val_acc: 0.8159 val_f1: 0.8139
Epoch 9/10 - train_loss: 

In [13]:
# # Map labels to 2 classes
# y_train = torch.tensor([0 if l in [0,1,2] else 1 for l in y_train], dtype=torch.long)
# y_test  = torch.tensor([0 if l in [0,1,2] else 1 for l in y_test], dtype=torch.long)

# # Verify mapping
# print("Unique labels in y_train after mapping:", torch.unique(y_train))
# print("Unique labels in y_test after mapping:", torch.unique(y_test))



In [14]:
model.load_state_dict(torch.load("best_robertanet.pth"))
final_metrics = evaluate(model, test_loader)
print("RoBERTaNet-like final metrics:", final_metrics)


RoBERTaNet-like final metrics: {'accuracy': 0.8159136177796414, 'precision': 0.8217898129474578, 'recall': 0.8153172778680111, 'f1': 0.8166049921370869}


In [15]:
# Convert PCA arrays to numpy
X_train_np = X_train.cpu().numpy()
X_test_np  = X_test.cpu().numpy()
y_train_np = y_train.cpu().numpy()
y_test_np  = y_test.cpu().numpy()

ml_models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
    "NaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

ml_results = []
for name, clf in ml_models.items():
    clf.fit(X_train_np, y_train_np)
    preds = clf.predict(X_test_np)
    ml_results.append([
        name,
        accuracy_score(y_test_np, preds),
        precision_score(y_test_np, preds, average='macro', zero_division=0),
        recall_score(y_test_np, preds, average='macro', zero_division=0),
        f1_score(y_test_np, preds, average='macro', zero_division=0)
    ])

ml_df = pd.DataFrame(ml_results, columns=['Model','Accuracy','Precision','Recall','F1'])
print("Baseline ML results:\n", ml_df)


[WinError 2] The system cannot find the file specified
  File "q:\Anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "q:\Anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "q:\Anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "q:\Anaconda3\Lib\subp

Baseline ML results:
           Model  Accuracy  Precision    Recall        F1
0  RandomForest  0.743684   0.742201  0.742557  0.741313
1           SVM  0.774609   0.770843  0.773873  0.770188
2    NaiveBayes  0.482126   0.506838  0.481336  0.470877
3           KNN  0.692525   0.671667  0.690988  0.660945


In [16]:
# We use the PCA vector as sequence length = 1; for deeper baselines we reshape accordingly.
# For a lightweight CNN/BiLSTM demo we'll adapt shapes but keep them simple.

class SimpleCNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(64, num_classes)
    def forward(self, x):
        x = x.unsqueeze(1) # (B,1,D)
        x = F.relu(self.conv(x)) # (B,64,D)
        x = self.pool(x).squeeze(-1) # (B,64)
        return self.fc(x)

class SimpleBiLSTM(nn.Module):
    def __init__(self, input_dim, hidden=128, num_classes=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden*2, num_classes)
    def forward(self, x):
        x = x.unsqueeze(1)  # (B,1,D)
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        return self.fc(out)

# instantiate
cnn = SimpleCNN(X_train.shape[1], num_classes).to(DEVICE)
bilstm = SimpleBiLSTM(X_train.shape[1], hidden=128, num_classes=num_classes).to(DEVICE)

def train_and_eval_torch_model(torch_model, epochs=5):
    torch_model.to(DEVICE)
    opt = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
    crit = nn.CrossEntropyLoss()
    for ep in range(epochs):
        torch_model.train()
        total=0
        for xb, yb in train_loader:
            xb,yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            loss = crit(torch_model(xb), yb)
            loss.backward()
            opt.step()
            total += loss.item()*xb.size(0)
        # no verbose per batch
    # eval
    torch_model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(DEVICE)
            out = torch_model(xb)
            preds.extend(torch.argmax(out, dim=1).cpu().numpy())
            trues.extend(yb.numpy())
    return accuracy_score(trues, preds), precision_score(trues, preds, average='macro', zero_division=0), recall_score(trues, preds, average='macro', zero_division=0), f1_score(trues, preds, average='macro', zero_division=0)

cnn_metrics = train_and_eval_torch_model(cnn, epochs=5)
bilstm_metrics = train_and_eval_torch_model(bilstm, epochs=5)
deep_df = pd.DataFrame([
    ["CNN", *cnn_metrics],
    ["BiLSTM", *bilstm_metrics]
], columns=['Model','Accuracy','Precision','Recall','F1'])
print("Deep baseline results:\n", deep_df)


Deep baseline results:
     Model  Accuracy  Precision    Recall        F1
0     CNN  0.336618   0.320126  0.335863  0.289882
1  BiLSTM  0.804487   0.802498  0.803738  0.800928


In [17]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

cv_acc = cross_val_score(rf, X_pca, y, cv=kf, scoring='accuracy', n_jobs=-1)
cv_prec = cross_val_score(rf, X_pca, y, cv=kf, scoring='precision_macro', n_jobs=-1)
cv_rec = cross_val_score(rf, X_pca, y, cv=kf, scoring='recall_macro', n_jobs=-1)
cv_f1 = cross_val_score(rf, X_pca, y, cv=kf, scoring='f1_macro', n_jobs=-1)

print("5-fold CV (RandomForest):")
print("Accuracy:", np.mean(cv_acc))
print("Precision:", np.mean(cv_prec))
print("Recall:", np.mean(cv_rec))
print("F1:", np.mean(cv_f1))


5-fold CV (RandomForest):
Accuracy: 0.7480291248312789
Precision: 0.7463899129081428
Recall: 0.746974082414235
F1: 0.7453651825347777


In [18]:
# Collect RoBERTaNet results + baselines into one table
proposed = ["RoBERTaNet (PCA+GloVe)", final_metrics['accuracy'], final_metrics['precision'], final_metrics['recall'], final_metrics['f1']]
all_results = pd.concat([ml_df, deep_df], ignore_index=True)
all_results.loc[len(all_results)] = proposed
all_results = all_results[['Model','Accuracy','Precision','Recall','F1']]
print("Final comparison table:")
display(all_results.sort_values(by='Accuracy', ascending=False).reset_index(drop=True))


Final comparison table:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,RoBERTaNet (PCA+GloVe),0.815914,0.82179,0.815317,0.816605
1,BiLSTM,0.804487,0.802498,0.803738,0.800928
2,SVM,0.774609,0.770843,0.773873,0.770188
3,RandomForest,0.743684,0.742201,0.742557,0.741313
4,KNN,0.692525,0.671667,0.690988,0.660945
5,NaiveBayes,0.482126,0.506838,0.481336,0.470877
6,CNN,0.336618,0.320126,0.335863,0.289882


In [19]:
# LIME expects a function that returns class probabilities for a list of raw texts.
explainer = LimeTextExplainer(class_names=list(classes))

def predict_proba_from_texts(texts):
    # texts -> glove -> pca -> model -> softmax probs
    X_tmp = np.vstack([get_glove_vector(clean_text(t)) for t in texts])
    X_tmp_pca = pca.transform(X_tmp)
    X_tmp_tensor = torch.tensor(X_tmp_pca, dtype=torch.float32).to(DEVICE)
    with torch.no_grad():
        logits = model(X_tmp_tensor)
        probs = F.softmax(logits, dim=1).cpu().numpy()
    return probs

# pick some examples and explain
for idx in range(3):
    text = df['clean_text'].iloc[idx]
    exp = explainer.explain_instance(text, predict_proba_from_texts, num_features=10)
    print("Original:", df[TEXT_COL].iloc[idx])
    display(exp.as_list())  # prints (feature, weight) pairs
    print("-"*80)


Original: In other words #katandandre, your food was crapilicious! #mkr


[(np.str_('food'), -0.002721404276564852),
 (np.str_('words'), -0.001109889114536454),
 (np.str_('crapilicious'), 0.00026098344418575226)]

--------------------------------------------------------------------------------
Original: Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc
Original: Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc


[(np.str_('white'), -1.613531223947455e-05)]

--------------------------------------------------------------------------------
Original: @XochitlSuckkks a classy whore? Or more red velvet cupcakes?
Original: @XochitlSuckkks a classy whore? Or more red velvet cupcakes?


[(np.str_('whore'), 0.0053927520000426514),
 (np.str_('classy'), -0.004928811283545458),
 (np.str_('velvet'), 0.002638974555206073),
 (np.str_('red'), 0.0021224102262066575),
 (np.str_('cupcakes'), 0.0015056497165607014)]

--------------------------------------------------------------------------------
