<a href="https://colab.research.google.com/github/hongyuw0427/Final-Year-Project/blob/main/FYP_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset paths used
# below (all under /content/drive/MyDrive) can be read and written.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1: DATA CLEANING & PREPROCESSING

In [None]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the English stop-word list and WordNet
# (needed by WordNetLemmatizer).
nltk.download("stopwords")
nltk.download("wordnet")

# ----------------------------
# Load raw dataset
# ----------------------------
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/FYP/cyberbullying_tweets.csv"
df = pd.read_csv(DATA_PATH)

print("Initial shape:", df.shape)

# Standardize column names
df.columns = [c.lower() for c in df.columns]

# Fail fast when a required column is absent. The label column is first read
# only after the slow cleaning pass, so validate it here together with the
# text column instead of discovering the problem minutes later.
missing_columns = [
    col for col in ("tweet_text", "cyberbullying_type") if col not in df.columns
]
if missing_columns:
    raise KeyError(f"{missing_columns[0]} column not found")

# Drop exact duplicate rows (reassign rather than mutate in place so the step
# stays chainable and easy to re-run).
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"Removed {before-after} duplicate rows")

# Check null values
print("Null tweet_text before fill:", df["tweet_text"].isnull().sum())

# Replace missing tweets with "" and force every value to str so the cleaning
# functions always receive a string.
df["tweet_text"] = df["tweet_text"].fillna("").astype(str)

# ----------------------------
# Text cleaning
# ----------------------------
# A set gives O(1) membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Maps every ASCII punctuation character to a space. Built once at definition
# time instead of on every call.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in string.punctuation})


def basic_clean(text: str) -> str:
    """Normalize a raw tweet to lowercase ASCII words separated by single spaces.

    Steps, in order: remove URLs, @mentions and #hashtags; drop non-ASCII
    characters; replace punctuation and digit runs with spaces; lowercase;
    collapse whitespace.

    Args:
        text: Raw tweet text. Any non-``str`` value is treated as missing.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string or nothing
        survives cleaning.
    """
    if not isinstance(text, str):
        return ""
    # "http..." is matched anywhere (links are often glued to the preceding
    # word), but "www." must start a token: the previous unanchored "www\S+"
    # also swallowed the tail of stretched words such as "awwww".
    # IGNORECASE additionally catches "HTTP://..." and "WWW....".
    text = re.sub(r'http\S+|\bwww\.\S+', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'#\w+', ' ', text)
    # Drop emoji and any other non-ASCII characters.
    text = text.encode('ascii', errors='ignore').decode()
    # Replace punctuation with a space instead of deleting it, so that
    # "hello,world" becomes two words rather than "helloworld" and
    # contractions split into pieces ("don't" -> "don t").
    text = text.translate(_PUNCT_TO_SPACE)
    text = re.sub(r'\d+', ' ', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_text_function(text: str) -> str:
    """Run ``basic_clean`` and then filter and lemmatize the resulting words.

    Words that are English stop words or only one character long are
    discarded; every remaining word is passed through the WordNet lemmatizer.

    Args:
        text: Raw tweet text.

    Returns:
        The surviving lemmas joined by single spaces (possibly ``""``).
    """
    lemmas = []
    for word in basic_clean(text).split():
        if len(word) > 1 and word not in stop_words:
            lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

# Run the full cleaning pipeline on every tweet.
df["clean_text"] = df["tweet_text"].apply(clean_text_function)

# Discard tweets that have no text left after cleaning.
# NOTE(review): rows that only become identical *after* cleaning are not
# de-duplicated here; only exact raw duplicates were removed earlier.
before = len(df)
has_text = df["clean_text"].str.strip().ne("")
df = df.loc[has_text].copy()
after = len(df)

print(f"Dropped {before-after} empty rows")
print("After cleaning:", df.shape)

# Per-class row counts after cleaning
print("\nClass distribution:")
print(df["cyberbullying_type"].value_counts())

# Persist the cleaned frame for the next stage
CLEAN_PATH = "/content/drive/MyDrive/Colab Notebooks/FYP/cyberbullying_tweets_cleaned.csv"
df.to_csv(CLEAN_PATH, index=False)
print("Saved cleaned dataset:", CLEAN_PATH)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initial shape: (47692, 2)
Removed 43 duplicate rows
Null tweet_text before fill: 0
Dropped 498 empty rows
After cleaning: (47151, 3)

Class distribution:
cyberbullying_type
religion               7996
age                    7989
ethnicity              7952
gender                 7896
not_cyberbullying      7759
other_cyberbullying    7559
Name: count, dtype: int64
Saved cleaned dataset: /content/drive/MyDrive/Colab Notebooks/FYP/cyberbullying_tweets_cleaned.csv


# 2: OUTLIER DETECTION & REMOVAL

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

# ----------------------------
# Load cleaned dataset
# ----------------------------
df = pd.read_csv(CLEAN_PATH)
texts = df["clean_text"].astype(str).tolist()

print("Dataset before outlier removal:", df.shape)
print("\nClass distribution BEFORE:")
print(df["cyberbullying_type"].value_counts())

# ----------------------------
# Compute sentence embeddings
# ----------------------------
embedder = SentenceTransformer("all-MiniLM-L6-v2")
X_embed = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# ----------------------------
# Scale + PCA
# ----------------------------
# Standardize each embedding dimension, then project to 50 components before
# fitting the outlier detector.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_embed)

pca = PCA(n_components=50, random_state=42)
X_reduced = pca.fit_transform(X_scaled)

# ----------------------------
# Isolation Forest
# ----------------------------
# NOTE(review): the detector is fit on the full dataset, before any
# train/test split, so rows that later land in a test split are filtered too.
iso = IsolationForest(
    n_estimators=100,
    contamination=0.03,
    random_state=42
)

# One label per row of X_reduced (same order as df): -1 is counted as an
# outlier below, +1 is kept as an inlier.
outlier_labels = iso.fit_predict(X_reduced)

n_outliers = int((outlier_labels == -1).sum())
print(f"\nDetected outliers: {n_outliers} ({n_outliers/len(df):.2%})")

# Keep inliers only. Filtering with a boolean mask (instead of a temporary
# "outlier" column) means the shape reported below matches the file that is
# saved; previously the helper column inflated the printed column count by one.
inlier_mask = outlier_labels == 1
df = df.loc[inlier_mask].copy()

print("\nClass distribution AFTER:")
print(df["cyberbullying_type"].value_counts())
print("Dataset after outlier removal:", df.shape)

# Save final dataset
FINAL_PATH = "/content/drive/MyDrive/Colab Notebooks/FYP/cyberbullying_tweets_final_cleaned.csv"
df.to_csv(FINAL_PATH, index=False)
print("Saved final dataset:", FINAL_PATH)




Dataset before outlier removal: (47151, 3)

Class distribution BEFORE:
cyberbullying_type
religion               7996
age                    7989
ethnicity              7952
gender                 7896
not_cyberbullying      7759
other_cyberbullying    7559
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1474 [00:00<?, ?it/s]


Detected outliers: 1415 (3.00%)

Class distribution AFTER:
cyberbullying_type
age                    7954
ethnicity              7847
religion               7698
gender                 7570
not_cyberbullying      7377
other_cyberbullying    7290
Name: count, dtype: int64
Dataset after outlier removal: (45736, 4)
Saved final dataset: /content/drive/MyDrive/Colab Notebooks/FYP/cyberbullying_tweets_final_cleaned.csv


# 3: BASELINE MODEL EXPERIMENTS

In [None]:
# ==========================================
# SCRIPT 3: BASELINE MODEL EXPERIMENTS (+ CM SAVE)
# ==========================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score, confusion_matrix
)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers, callbacks

# ----------------------------
# Input: cleaned tweets with outliers already removed
# ----------------------------
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/FYP/cyberbullying_tweets_final_cleaned.csv"
df = pd.read_csv(DATA_PATH)
print("Loaded dataset shape:", df.shape)

# ----------------------------
# Per-class row counts
# ----------------------------
print("\nClass distribution (after cleaning & outlier removal):")
print(df["cyberbullying_type"].value_counts())

# ----------------------------
# Features and targets
# ----------------------------
labels = df["cyberbullying_type"].astype(str)
texts = df["clean_text"].astype(str).tolist()

# Integer-encode the class names; label_names keeps the original names and is
# later used for the confusion-matrix tick labels.
le = LabelEncoder()
y = le.fit_transform(labels)
label_names = le.classes_
num_classes = len(label_names)

# ----------------------------
# Experiment settings
# ----------------------------
SEED = 42
SPLITS = [(0.9,0.1), (0.8,0.2), (0.7,0.3), (0.6,0.4)]  # (train, test) fractions

results = []  # one metrics dict is appended per (model, split)

# ----------------------------
# Output locations (created if missing)
# ----------------------------
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/baseline_results"
CM_DIR = os.path.join(OUTPUT_DIR, "confusion_matrices")
for out_dir in (OUTPUT_DIR, CM_DIR):
    os.makedirs(out_dir, exist_ok=True)

# ----------------------------
# Confusion matrix saver (NO PRINT)
# ----------------------------
def save_confusion_matrix(y_true, y_pred, labels, out_path, title=None):
    """Render a confusion matrix as an annotated heatmap and write it to disk.

    Nothing is shown inline; the figure is closed before returning, including
    when drawing or saving raises.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        labels: Tick labels used on both axes (presumably ordered like the
            rows/columns of the confusion matrix; verify against the caller).
        out_path: Destination image path passed to ``Figure.savefig``.
        title: Optional plot title; skipped when falsy.
    """
    cm = confusion_matrix(y_true, y_pred)
    # Work on an explicit figure/axes pair rather than pyplot's implicit
    # "current figure", so exactly this figure is saved and closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        if title:
            ax.set_title(title)
        fig.tight_layout()
        fig.savefig(out_path, dpi=200)
    finally:
        # Always release the figure; this function is called once per
        # model/split and open figures would otherwise accumulate on failure.
        plt.close(fig)

# ==========================================================
# CORRECTED BASELINE LOOP (NO LEAKAGE)
# ==========================================================
# For every split, the vectorizer and tokenizer are fit on the training texts
# only and merely applied to the test texts.

# The tokenizer's vocabulary cap must equal the Embedding input size, so both
# read the same constant; MAX_LEN is the padded sequence length.
VOCAB_SIZE = 20000
MAX_LEN = 100


def _record_baseline(model_name, train_frac, split_tag, y_test, y_test_bin, y_pred, y_prob):
    """Append one metrics row to ``results`` and save the confusion matrix.

    Args:
        model_name: Identifier stored in the "model" field and used in the
            confusion-matrix file name and title (e.g. "LR_BASE").
        train_frac: Training fraction of the current split.
        split_tag: "<train%>_<test%>" tag used in the file name and title.
        y_test: True integer class ids of the test split.
        y_test_bin: One-hot (binarized) version of ``y_test`` for ROC-AUC.
        y_pred: Predicted integer class ids.
        y_prob: Predicted class probabilities, one column per class.
    """
    results.append({
        "model": model_name,
        "train_frac": train_frac,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="macro"),
        "recall": recall_score(y_test, y_pred, average="macro"),
        "f1": f1_score(y_test, y_pred, average="macro"),
        "roc_auc": roc_auc_score(y_test_bin, y_prob, multi_class="ovr")
    })

    save_confusion_matrix(
        y_test, y_pred, label_names,
        out_path=os.path.join(CM_DIR, f"CM_{model_name}_{split_tag}.png"),
        title=f"{model_name} {split_tag}"
    )


def _train_recurrent_baseline(X_train_pad, y_train, bidirectional=False):
    """Build, compile and fit the Embedding -> (Bi)LSTM -> softmax baseline.

    Args:
        X_train_pad: Padded integer sequences of the training split.
        y_train: Integer class ids aligned with ``X_train_pad``.
        bidirectional: When true, wrap the LSTM in ``layers.Bidirectional``.

    Returns:
        The fitted Keras model.
    """
    # Seed Python, NumPy and TensorFlow *before* any layer is created so the
    # weight initialisation and shuffling start from the same state each run.
    # (GPU kernels may still introduce small run-to-run differences.)
    tf.keras.utils.set_random_seed(SEED)

    recurrent = layers.LSTM(64)
    if bidirectional:
        recurrent = layers.Bidirectional(recurrent)

    model = Sequential([
        layers.Embedding(VOCAB_SIZE, 64),
        recurrent,
        layers.Dense(num_classes, activation="softmax")
    ])

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    # validation_split carves the validation data out of the TRAIN arrays, so
    # early stopping never sees the test split.
    model.fit(
        X_train_pad, y_train,
        epochs=4,
        batch_size=128,
        validation_split=0.1,
        callbacks=[callbacks.EarlyStopping(patience=1, restore_best_weights=True)],
        verbose=1
    )
    return model


for train_frac, test_frac in SPLITS:

    # round() instead of int(): int() truncates, so a fraction whose product
    # lands just below a whole number would be tagged one percent low.
    train_pct, test_pct = round(train_frac * 100), round(test_frac * 100)
    split_tag = f"{train_pct}_{test_pct}"

    print("\n" + "="*40)
    print(f"TRAIN {train_pct}% | TEST {test_pct}%")
    print("========================================")

    # 1. SPLIT FIRST (Stratified)
    # Raw texts are split before any vectorization/tokenization is fit.
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        texts, y,
        test_size=test_frac,
        random_state=SEED,
        stratify=y
    )

    y_test_bin = label_binarize(y_test, classes=range(num_classes))

    # ----------------------------
    # 2. FIT VECTORIZER (On Train ONLY)
    # ----------------------------
    print("Generating Count Features (BoW)...")
    vectorizer = CountVectorizer() # Reset for every split
    X_train_cv = vectorizer.fit_transform(X_train_text) # FIT on TRAIN
    X_test_cv  = vectorizer.transform(X_test_text)      # TRANSFORM TEST

    # ----------------------------
    # 3. FIT TOKENIZER (On Train ONLY)
    # ----------------------------
    print("Generating Sequences for Deep Learning...")
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train_text) # FIT on TRAIN

    X_train_seq = tokenizer.texts_to_sequences(X_train_text)
    X_test_seq  = tokenizer.texts_to_sequences(X_test_text)

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    # ----------------------------
    # MODEL A: Logistic Regression
    # ----------------------------
    print("Training LR...")
    lr = LogisticRegression(max_iter=2000)
    lr.fit(X_train_cv, y_train)

    y_pred = lr.predict(X_test_cv)
    y_prob = lr.predict_proba(X_test_cv)
    _record_baseline("LR_BASE", train_frac, split_tag, y_test, y_test_bin, y_pred, y_prob)

    # ----------------------------
    # MODEL B: Random Forest
    # ----------------------------
    print("Training RF...")
    # random_state makes the bootstrap/feature sampling repeatable across runs.
    rf = RandomForestClassifier(n_jobs=-1, random_state=SEED)
    rf.fit(X_train_cv, y_train)

    y_pred = rf.predict(X_test_cv)
    y_prob = rf.predict_proba(X_test_cv)
    _record_baseline("RF_BASE", train_frac, split_tag, y_test, y_test_bin, y_pred, y_prob)

    # ----------------------------
    # MODEL C: SVM (Standard SVC)
    # ----------------------------
    print("Training SVM...")
    # probability=True fits an internal cross-validated calibration that
    # shuffles the data; random_state pins that shuffle.
    svm = SVC(kernel="linear", probability=True, random_state=SEED)
    svm.fit(X_train_cv, y_train)

    y_pred = svm.predict(X_test_cv)
    y_prob = svm.predict_proba(X_test_cv)
    _record_baseline("SVM_BASE", train_frac, split_tag, y_test, y_test_bin, y_pred, y_prob)

    # ----------------------------
    # MODEL D: LSTM Baseline
    # ----------------------------
    print("Training LSTM...")
    lstm = _train_recurrent_baseline(X_train_pad, y_train)

    y_prob = lstm.predict(X_test_pad)
    y_pred = np.argmax(y_prob, axis=1)
    _record_baseline("LSTM_BASE", train_frac, split_tag, y_test, y_test_bin, y_pred, y_prob)

    # ----------------------------
    # MODEL E: BiLSTM Baseline
    # ----------------------------
    print("Training BiLSTM...")
    bilstm = _train_recurrent_baseline(X_train_pad, y_train, bidirectional=True)

    y_prob = bilstm.predict(X_test_pad)
    y_pred = np.argmax(y_prob, axis=1)
    _record_baseline("BiLSTM_BASE", train_frac, split_tag, y_test, y_test_bin, y_pred, y_prob)

# ----------------------------
# Write the collected metrics (one row per model/split) to CSV
# ----------------------------
SAVE_PATH = os.path.join(OUTPUT_DIR, "baseline_results.csv")
results_df = pd.DataFrame(results)
results_df.to_csv(SAVE_PATH, index=False)

# Report where the artefacts were written
print("\nBaseline experiments completed.")
print("Saved results to:", SAVE_PATH)
print("Confusion matrices saved to:", CM_DIR)


Loaded dataset shape: (45736, 3)

Class distribution (after cleaning & outlier removal):
cyberbullying_type
age                    7954
ethnicity              7847
religion               7698
gender                 7570
not_cyberbullying      7377
other_cyberbullying    7290
Name: count, dtype: int64

TRAIN 90% | TEST 10%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4




[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 160ms/step - accuracy: 0.5996 - loss: 1.0616 - val_accuracy: 0.8149 - val_loss: 0.4499
Epoch 2/4
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 165ms/step - accuracy: 0.8488 - loss: 0.3711 - val_accuracy: 0.8266 - val_loss: 0.4226
Epoch 3/4
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 160ms/step - accuracy: 0.8885 - loss: 0.2800 - val_accuracy: 0.8237 - val_loss: 0.4565
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step
Training BiLSTM...
Epoch 1/4




[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 278ms/step - accuracy: 0.5887 - loss: 1.0814 - val_accuracy: 0.8142 - val_loss: 0.4550
Epoch 2/4
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 278ms/step - accuracy: 0.8336 - loss: 0.3962 - val_accuracy: 0.8096 - val_loss: 0.4379
Epoch 3/4
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 279ms/step - accuracy: 0.8824 - loss: 0.2982 - val_accuracy: 0.8237 - val_loss: 0.4534
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step

TRAIN 80% | TEST 20%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4




[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 157ms/step - accuracy: 0.5726 - loss: 1.0983 - val_accuracy: 0.7967 - val_loss: 0.4684
Epoch 2/4
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 159ms/step - accuracy: 0.8378 - loss: 0.3907 - val_accuracy: 0.8213 - val_loss: 0.4184
Epoch 3/4
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 158ms/step - accuracy: 0.8893 - loss: 0.2835 - val_accuracy: 0.8232 - val_loss: 0.4426
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step
Training BiLSTM...
Epoch 1/4




[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 272ms/step - accuracy: 0.5208 - loss: 1.1467 - val_accuracy: 0.7835 - val_loss: 0.4938
Epoch 2/4
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 272ms/step - accuracy: 0.8222 - loss: 0.4077 - val_accuracy: 0.8196 - val_loss: 0.4314
Epoch 3/4
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 272ms/step - accuracy: 0.8784 - loss: 0.3031 - val_accuracy: 0.8161 - val_loss: 0.4530
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step

TRAIN 70% | TEST 30%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4




[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 157ms/step - accuracy: 0.5637 - loss: 1.1688 - val_accuracy: 0.8020 - val_loss: 0.4439
Epoch 2/4
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 159ms/step - accuracy: 0.8337 - loss: 0.4015 - val_accuracy: 0.8101 - val_loss: 0.4245
Epoch 3/4
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 163ms/step - accuracy: 0.8838 - loss: 0.3026 - val_accuracy: 0.8154 - val_loss: 0.4329
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step
Training BiLSTM...
Epoch 1/4




[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 286ms/step - accuracy: 0.5258 - loss: 1.1807 - val_accuracy: 0.7829 - val_loss: 0.4699
Epoch 2/4
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 268ms/step - accuracy: 0.8076 - loss: 0.4263 - val_accuracy: 0.8032 - val_loss: 0.4309
Epoch 3/4
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 270ms/step - accuracy: 0.8631 - loss: 0.3331 - val_accuracy: 0.8195 - val_loss: 0.4340
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step

TRAIN 60% | TEST 40%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4




[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 162ms/step - accuracy: 0.4955 - loss: 1.2131 - val_accuracy: 0.7887 - val_loss: 0.4613
Epoch 2/4
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 169ms/step - accuracy: 0.8172 - loss: 0.4239 - val_accuracy: 0.8237 - val_loss: 0.4116
Epoch 3/4
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 157ms/step - accuracy: 0.8797 - loss: 0.3012 - val_accuracy: 0.8328 - val_loss: 0.4256
[1m572/572[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step
Training BiLSTM...
Epoch 1/4




[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 276ms/step - accuracy: 0.5037 - loss: 1.2475 - val_accuracy: 0.7792 - val_loss: 0.4765
Epoch 2/4
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 274ms/step - accuracy: 0.8017 - loss: 0.4402 - val_accuracy: 0.8069 - val_loss: 0.4345
Epoch 3/4
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 281ms/step - accuracy: 0.8616 - loss: 0.3405 - val_accuracy: 0.8215 - val_loss: 0.4202
Epoch 4/4
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 274ms/step - accuracy: 0.9056 - loss: 0.2479 - val_accuracy: 0.8036 - val_loss: 0.4645
[1m572/572[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 24ms/step

Baseline experiments completed.
Saved results to: /content/drive/MyDrive/Colab Notebooks/FYP/baseline_results/baseline_results.csv
Confusion matrices saved to: /content/drive/MyDrive/Colab Notebooks/FYP/baseline_results/confusion_matrices


# Rerun Baseline Experiments After Dropping Overlap Rows

In [None]:
# ==========================================
# SCRIPT 3 (RERUN): BASELINE MODEL EXPERIMENTS ON THE OVERLAP-DROPPED DATASET (+ CM SAVE)
# ==========================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score, confusion_matrix
)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers, callbacks

# ----------------------------
# Input: cleaned tweets with overlapping rows dropped (per the file name)
# ----------------------------
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/FYP/cyberbullying_cleaned_dropOverlappedRows.csv"
df = pd.read_csv(DATA_PATH)
print("Loaded dataset shape:", df.shape)

# ----------------------------
# Per-class row counts
# ----------------------------
print("\nClass distribution (after cleaning & outlier removal):")
print(df["cyberbullying_type"].value_counts())

# ----------------------------
# Features and targets
# ----------------------------
labels = df["cyberbullying_type"].astype(str)
texts = df["clean_text"].astype(str).tolist()

# Integer-encode the class names; label_names keeps the original names and is
# later used for the confusion-matrix tick labels.
le = LabelEncoder()
y = le.fit_transform(labels)
label_names = le.classes_
num_classes = len(label_names)

# ----------------------------
# Experiment settings
# ----------------------------
SEED = 42
SPLITS = [(0.9,0.1), (0.8,0.2), (0.7,0.3), (0.6,0.4)]  # (train, test) fractions

results = []  # one metrics dict is appended per (model, split)

# ----------------------------
# Output locations (created if missing)
# ----------------------------
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/baseline_results_droppedOverlap"
CM_DIR = os.path.join(OUTPUT_DIR, "confusion_matrices")
for out_dir in (OUTPUT_DIR, CM_DIR):
    os.makedirs(out_dir, exist_ok=True)

# ----------------------------
# Confusion matrix saver (NO PRINT)
# ----------------------------
def save_confusion_matrix(y_true, y_pred, labels, out_path, title=None):
    """Render a confusion-matrix heatmap and save it to ``out_path`` (no display).

    Args:
        y_true: True class labels for the evaluated samples.
        y_pred: Predicted class labels, same encoding as ``y_true``.
        labels: Class names used as tick labels, one per class, in class order.
            When ``y_true`` is integer-encoded, code ``i`` is assumed to
            correspond to ``labels[i]`` (as produced by ``LabelEncoder``).
        out_path: Destination image path passed to ``savefig``.
        title: Optional plot title; omitted when falsy.
    """
    # Pin the matrix to exactly one row/column per entry in `labels`. Without
    # this, a class that is absent from both y_true and y_pred is dropped from
    # the matrix and the tick labels no longer line up with the cells.
    if np.issubdtype(np.asarray(y_true).dtype, np.integer):
        class_ids = np.arange(len(labels))
    else:
        # Non-integer targets: the targets are assumed to be the class names
        # themselves, so pin to those directly.
        class_ids = list(labels)
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        if title:
            ax.set_title(title)
        fig.tight_layout()
        fig.savefig(out_path, dpi=200)
    finally:
        # Always release the figure, even if saving fails, so figures do not
        # pile up when this is called repeatedly in a loop.
        plt.close(fig)

# ==========================================================
# CORRECTED BASELINE LOOP (NO LEAKAGE)
# ==========================================================
# Sequence-model hyperparameters, named once so the tokenizer vocabulary size
# and the embedding input size cannot drift apart.
MAX_VOCAB = 20000
MAX_LEN = 100
EMBED_DIM = 64
LSTM_UNITS = 64
EPOCHS = 4
BATCH_SIZE = 128
VAL_SPLIT = 0.1


def _record_result(model_tag, train_frac, split_tag,
                   y_test, y_test_bin, y_pred, y_prob):
    """Append one metrics row to ``results`` and save the confusion matrix PNG.

    Args:
        model_tag: Value stored in the "model" column; also used in the PNG
            file name and plot title.
        train_frac: Training fraction of the current split, stored as-is.
        split_tag: "<train%>_<test%>" suffix for the PNG file name and title.
        y_test: Integer-encoded true labels of the test split.
        y_test_bin: ``label_binarize`` output for ``y_test`` (used for ROC AUC).
        y_pred: Integer-encoded predicted labels.
        y_prob: Per-class scores returned by the model for the test split.
    """
    results.append({
        "model": model_tag,
        "train_frac": train_frac,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="macro"),
        "recall": recall_score(y_test, y_pred, average="macro"),
        "f1": f1_score(y_test, y_pred, average="macro"),
        "roc_auc": roc_auc_score(y_test_bin, y_prob, multi_class="ovr")
    })

    save_confusion_matrix(
        y_test, y_pred, label_names,
        out_path=os.path.join(CM_DIR, f"CM_{model_tag}_{split_tag}.png"),
        title=f"{model_tag} {split_tag}"
    )


def _build_sequence_model(bidirectional):
    """Build and compile the Embedding -> (Bi)LSTM -> softmax baseline.

    Keras global state is cleared and the RNG seeds are reset *before* any
    layer is constructed, so every model starts from the same seeded state and
    models built in earlier iterations do not accumulate in memory.
    NOTE: GPU kernels may still introduce small run-to-run differences.

    Args:
        bidirectional: If True, wrap the LSTM layer in ``Bidirectional``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(SEED)

    recurrent = layers.LSTM(LSTM_UNITS)
    if bidirectional:
        recurrent = layers.Bidirectional(recurrent)

    model = Sequential([
        layers.Embedding(MAX_VOCAB, EMBED_DIM),
        recurrent,
        layers.Dense(num_classes, activation="softmax")
    ])
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )
    return model


def _fit_and_predict(model, X_train_pad, y_train, X_test_pad):
    """Fit ``model`` on the training sequences and return test-set probabilities.

    The validation data is carved out of the TRAIN set only (``validation_split``),
    so the test split is never seen during training or early stopping.
    """
    model.fit(
        X_train_pad, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=VAL_SPLIT,
        callbacks=[callbacks.EarlyStopping(patience=1, restore_best_weights=True)],
        verbose=1
    )
    return model.predict(X_test_pad)


for train_frac, test_frac in SPLITS:

    # round() rather than int(): int() truncates, so a fraction whose product
    # lands just below a whole number (e.g. 0.29 * 100) would be mislabeled.
    train_pct = round(train_frac * 100)
    test_pct = round(test_frac * 100)
    split_tag = f"{train_pct}_{test_pct}"

    print("\n" + "=" * 40)
    print(f"TRAIN {train_pct}% | TEST {test_pct}%")
    print("=" * 40)

    # 1. SPLIT FIRST (Stratified)
    # Raw texts are split before any vectorizer/tokenizer is fitted.
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        texts, y,
        test_size=test_frac,
        random_state=SEED,
        stratify=y
    )

    y_test_bin = label_binarize(y_test, classes=range(num_classes))

    # ----------------------------
    # 2. FIT VECTORIZER (On Train ONLY)
    # ----------------------------
    print("Generating Count Features (BoW)...")
    vectorizer = CountVectorizer()  # Fresh vocabulary for every split
    X_train_cv = vectorizer.fit_transform(X_train_text)  # FIT on TRAIN
    X_test_cv = vectorizer.transform(X_test_text)        # TRANSFORM TEST

    # ----------------------------
    # 3. FIT TOKENIZER (On Train ONLY)
    # ----------------------------
    print("Generating Sequences for Deep Learning...")
    tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train_text)  # FIT on TRAIN

    X_train_seq = tokenizer.texts_to_sequences(X_train_text)
    X_test_seq = tokenizer.texts_to_sequences(X_test_text)

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    # ----------------------------
    # MODELS A-C: Logistic Regression, Random Forest, SVM (bag-of-words)
    # ----------------------------
    # New estimator instances per split. random_state is set wherever the
    # estimator is stochastic (forest bootstrapping; the internal
    # cross-validation SVC runs when probability=True) so reruns reproduce.
    # Note: SVC is kept here as the "Baseline" vs LinearSVC in the FE script.
    lr = LogisticRegression(max_iter=2000)
    rf = RandomForestClassifier(n_jobs=-1, random_state=SEED)
    svm = SVC(kernel="linear", probability=True, random_state=SEED)

    for display_name, model_tag, clf in (
        ("LR", "LR_BASE", lr),
        ("RF", "RF_BASE", rf),
        ("SVM", "SVM_BASE", svm),
    ):
        print(f"Training {display_name}...")
        clf.fit(X_train_cv, y_train)

        y_pred = clf.predict(X_test_cv)
        y_prob = clf.predict_proba(X_test_cv)

        _record_result(model_tag, train_frac, split_tag,
                       y_test, y_test_bin, y_pred, y_prob)

    # ----------------------------
    # MODEL D: LSTM Baseline
    # ----------------------------
    print("Training LSTM...")
    lstm = _build_sequence_model(bidirectional=False)
    y_prob = _fit_and_predict(lstm, X_train_pad, y_train, X_test_pad)
    y_pred = np.argmax(y_prob, axis=1)

    _record_result("LSTM_BASE", train_frac, split_tag,
                   y_test, y_test_bin, y_pred, y_prob)

    # ----------------------------
    # MODEL E: BiLSTM Baseline
    # ----------------------------
    print("Training BiLSTM...")
    bilstm = _build_sequence_model(bidirectional=True)
    y_prob = _fit_and_predict(bilstm, X_train_pad, y_train, X_test_pad)
    y_pred = np.argmax(y_prob, axis=1)

    _record_result("BiLSTM_BASE", train_frac, split_tag,
                   y_test, y_test_bin, y_pred, y_prob)

# ----------------------------
# Persist the collected baseline metrics
# ----------------------------
SAVE_PATH = os.path.join(OUTPUT_DIR, "baseline_results_droppedOverlap.csv")

# `results` holds one dict per (model, split); each dict becomes one CSV row.
results_df = pd.DataFrame(data=results)
results_df.to_csv(SAVE_PATH, index=False)

# Final summary of where the artefacts were written.
print("\nBaseline experiments completed.")
print("Saved results to:", SAVE_PATH)
print("Confusion matrices saved to:", CM_DIR)


Loaded dataset shape: (38446, 3)

Class distribution (after cleaning & outlier removal):
cyberbullying_type
age                  7954
ethnicity            7847
religion             7698
gender               7570
not_cyberbullying    7377
Name: count, dtype: int64

TRAIN 90% | TEST 10%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 154ms/step - accuracy: 0.6837 - loss: 0.8954 - val_accuracy: 0.9246 - val_loss: 0.2133
Epoch 2/4
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 160ms/step - accuracy: 0.9376 - loss: 0.1790 - val_accuracy: 0.9301 - val_loss: 0.1962
Epoch 3/4
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 167ms/step - accuracy: 0.9590 - loss: 0.1219 - val_accuracy: 0.9205 - val_loss: 0.2277
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step
Traini



[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 273ms/step - accuracy: 0.6470 - loss: 0.9144 - val_accuracy: 0.9203 - val_loss: 0.2220
Epoch 2/4
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 270ms/step - accuracy: 0.9353 - loss: 0.1947 - val_accuracy: 0.9260 - val_loss: 0.1960
Epoch 3/4
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 276ms/step - accuracy: 0.9571 - loss: 0.1309 - val_accuracy: 0.9252 - val_loss: 0.2226
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step

TRAIN 80% | TEST 20%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 168ms/step - accuracy: 0.6651 - loss: 0.9244 - val_accuracy: 0.9151 - val_loss: 0.2283
Epoch 2/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 163ms/step - accuracy: 0.9366 - loss: 0



[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 272ms/step - accuracy: 0.6483 - loss: 0.9556 - val_accuracy: 0.9164 - val_loss: 0.2309
Epoch 2/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 271ms/step - accuracy: 0.9349 - loss: 0.1955 - val_accuracy: 0.9269 - val_loss: 0.2066
Epoch 3/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 273ms/step - accuracy: 0.9578 - loss: 0.1298 - val_accuracy: 0.9278 - val_loss: 0.2034
Epoch 4/4
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 274ms/step - accuracy: 0.9706 - loss: 0.0974 - val_accuracy: 0.9272 - val_loss: 0.2204
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step

TRAIN 70% | TEST 30%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 170ms/step - accuracy: 0.6494 - loss: 1



[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 273ms/step - accuracy: 0.6397 - loss: 1.0383 - val_accuracy: 0.9131 - val_loss: 0.2470
Epoch 2/4
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 266ms/step - accuracy: 0.9288 - loss: 0.2098 - val_accuracy: 0.9257 - val_loss: 0.2190
Epoch 3/4
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 280ms/step - accuracy: 0.9545 - loss: 0.1357 - val_accuracy: 0.9220 - val_loss: 0.2285
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step

TRAIN 60% | TEST 40%
Generating Count Features (BoW)...
Generating Sequences for Deep Learning...
Training LR...
Training RF...
Training SVM...
Training LSTM...
Epoch 1/4
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 159ms/step - accuracy: 0.6121 - loss: 1.0651 - val_accuracy: 0.9189 - val_loss: 0.2427
Epoch 2/4
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 160ms/step - accuracy: 0.9292 - loss: 



[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 273ms/step - accuracy: 0.6470 - loss: 1.0569 - val_accuracy: 0.9198 - val_loss: 0.2405
Epoch 2/4
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 272ms/step - accuracy: 0.9289 - loss: 0.2081 - val_accuracy: 0.9237 - val_loss: 0.2183
Epoch 3/4
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 273ms/step - accuracy: 0.9544 - loss: 0.1328 - val_accuracy: 0.9254 - val_loss: 0.2151
Epoch 4/4
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 275ms/step - accuracy: 0.9738 - loss: 0.0869 - val_accuracy: 0.9215 - val_loss: 0.2396
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step

Baseline experiments completed.
Saved results to: /content/drive/MyDrive/Colab Notebooks/FYP/baseline_results_droppedOverlap/baseline_results_droppedOverlap.csv
Confusion matrices saved to: /content/drive/MyDrive/Colab Notebooks/FYP/baseline_results_droppedOverlap/confusion_m