In [None]:
# standard libraries
import numpy as np
import matplotlib.pyplot as plt

# additional libraries
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from pandas import read_pickle

# Scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score, ConfusionMatrixDisplay
)
from sklearn.model_selection import (
    GridSearchCV, StratifiedKFold
)

# Imbalanced-learn 
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [None]:
# Load the preprocessed dataset; later cells read its 'embedding' and 'label' columns.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
df = read_pickle('path/Dataset/New_Preprocessed_Dataset.pkl')

In [3]:
# Relabel the 'none' class as 'normal' (all other label values are left untouched).
df['label'] = df['label'].replace({'none': 'normal'})

In [None]:
# Targets as a plain Python list, features as one NumPy array built from the
# per-row entries of the 'embedding' column.
labels = df['label'].tolist()
embeddings = np.array(df['embedding'].tolist())

## Random Forest

In [None]:
def train_rf_with_gridsearch(X_train, y_train, X_test, y_test):
    """Tune a random forest with an exhaustive grid search and return the fitted search.

    The search scores candidates by macro F1 using a shuffled, seeded 5-fold
    stratified split of the training data and runs on all available cores.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, y_test
        Accepted so every training function shares one signature; they are
        not used inside this function.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_estimator_`` / ``best_params_``
        are available on it).
    """
    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        # Candidate hyper-parameters explored by the search.
        param_grid={
            'n_estimators': [50],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5],
        },
        scoring='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
        verbose=1,
    )

    # ``fit`` returns the search object itself.
    return search.fit(X_train, y_train)


In [None]:
def cross_val_with_oversampling(X, y, oversampler, train_fn, n_splits=5, class_names=["normal", "racism", "sexism"]):
    """Stratified k-fold evaluation with oversampling applied to the training split only.

    For each fold the training part is resampled with ``oversampler``, a model is
    fitted through ``train_fn`` and its ``best_estimator_`` is scored on the
    untouched validation part. Per-fold metrics, a classification report and a
    confusion-matrix heatmap are printed/shown, followed by a summary over all
    folds (mean confusion matrix in counts and in row percentages, mean metrics).

    Parameters
    ----------
    X : array-like
        Features; must support indexing with integer index arrays (``X[idx]``).
    y : sequence
        Labels, one per row of ``X``.
    oversampler : object
        Anything exposing ``fit_resample(X, y)``.
    train_fn : callable
        Called as ``train_fn(X_train, y_train, X_val, y_val)``; must return an
        object with ``best_estimator_`` and ``best_params_`` attributes.
    n_splits : int, default 5
        Number of stratified folds.
    class_names : sequence of str
        Display names for the classes, in the sorted order of the distinct
        values found in ``y``.

    Returns
    -------
    tuple
        ``(all_f1_scores, best_params_list)`` - the macro F1 and the
        ``best_params_`` of every fold, in fold order.

    Raises
    ------
    ValueError
        If ``class_names`` does not have one entry per distinct label in ``y``.

    Notes
    -----
    NOTE(review): resampling happens before ``train_fn`` is called. If
    ``train_fn`` runs its own cross-validation on the data it receives, resampled
    copies of one original sample can end up on both sides of its inner splits,
    so its internal scores may be optimistic. The validation split of this
    function is never resampled.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Convert once instead of rebuilding the array twice per fold.
    y = np.asarray(y)
    class_names = list(class_names)

    # Pin the label set so every fold yields a confusion matrix of the same shape
    # and order, even if a class is missing from a fold's truth or predictions.
    label_values = np.unique(y)
    if len(class_names) != len(label_values):
        raise ValueError(
            f"class_names has {len(class_names)} entries but y contains "
            f"{len(label_values)} distinct labels"
        )

    def _show_heatmap(matrix, fmt, title):
        # Draw one annotated heatmap and release the figure afterwards.
        fig, ax = plt.subplots(figsize=(6, 5))
        sns.heatmap(matrix, annot=True, fmt=fmt, cmap="Blues", cbar=False,
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.grid(False)
        plt.show()
        plt.close(fig)

    all_f1_scores = []
    all_accuracies = []
    all_precisions = []
    all_recalls = []
    all_conf_matrices = []
    best_params_list = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\nFold {fold} ----------------------------")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Oversampling (training split only)
        X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)

        # Model training
        grid = train_fn(X_train_over, y_train_over, X_val, y_val)
        best_model = grid.best_estimator_

        # Predictions
        y_pred = best_model.predict(X_val)

        # Metrics
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_val, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_val, y_pred, average='macro', zero_division=0)

        all_accuracies.append(acc)
        all_precisions.append(prec)
        all_recalls.append(rec)
        all_f1_scores.append(f1)
        best_params_list.append(grid.best_params_)

        print(f"Accuracy : {acc:.4f}")
        print(f"Precision : {prec:.4f}")
        print(f"Recall : {rec:.4f}")
        print(f"F1 macro : {f1:.4f}")
        print(classification_report(y_val, y_pred, labels=label_values,
                                    target_names=class_names, zero_division=0))

        # Confusion matrix (raw counts, fixed label order)
        cm = confusion_matrix(y_val, y_pred, labels=label_values)
        all_conf_matrices.append(cm)
        _show_heatmap(cm, 'd', f"Confusion Matrix (Fold {fold})")

    # Final summary
    print("\nFinal Cross-Validation Summary:")

    # Keep the mean in floating point: percentages are derived from the exact
    # mean, and the displayed counts are rounded rather than truncated.
    mean_cm_exact = np.mean(all_conf_matrices, axis=0)
    row_totals = mean_cm_exact.sum(axis=1, keepdims=True)
    cm_percent = np.round(
        np.divide(mean_cm_exact, row_totals,
                  out=np.zeros_like(mean_cm_exact), where=row_totals > 0) * 100,
        1,
    )
    mean_cm = np.rint(mean_cm_exact).astype(int)

    _show_heatmap(mean_cm, 'd', "Mean Confusion Matrix")
    _show_heatmap(cm_percent, '.1f', "Mean Confusion Matrix (Percentages)")

    print(f"Mean Accuracy : {np.mean(all_accuracies):.4f}")
    print(f"Mean Precision : {np.mean(all_precisions):.4f}")
    print(f"Mean Recall : {np.mean(all_recalls):.4f}")
    print(f"Mean F1 macro : {np.mean(all_f1_scores):.4f}")
    print(f"± Std F1 macro : {np.std(all_f1_scores):.4f}")

    return all_f1_scores, best_params_list

In [None]:
# 10-fold evaluation of the random forest with random over-sampling of each training fold.
ros = RandomOverSampler(random_state=42)
f1s_ros, params_ros = cross_val_with_oversampling(
    embeddings,
    labels,
    oversampler=ros,
    train_fn=train_rf_with_gridsearch,
    n_splits=10,
)

In [None]:
# 10-fold evaluation of the random forest with SMOTE applied to each training fold.
smote = SMOTE(random_state=42)
f1s_smote, params_smote = cross_val_with_oversampling(
    embeddings,
    labels,
    oversampler=smote,
    train_fn=train_rf_with_gridsearch,
    n_splits=10,
)

### GAN

In [None]:
def cross_val_with_gan(X, y, df_gan, label="GAN", n_splits=5):
    """Stratified k-fold evaluation where each training fold is augmented with GAN samples.

    ``df_gan`` is shuffled (seeded) and cut into ``n_splits`` consecutive chunks;
    fold *i* appends chunk *i* to its training split, so every generated sample
    is used in exactly one fold. The model is trained with
    ``train_rf_with_gridsearch`` and scored on the untouched test split.
    Per-fold scores and confusion matrices are printed/shown, followed by the
    mean confusion matrices (counts and row percentages) and a score summary.

    Parameters
    ----------
    X : array-like
        Real features; must support indexing with integer index arrays.
    y : sequence
        Real labels, one per row of ``X``.
    df_gan : pandas.DataFrame
        Generated samples with an ``'embedding'`` and a ``'label'`` column.
    label : str, default "GAN"
        Name used in plot titles and in the summary header.
    n_splits : int, default 5
        Number of stratified folds (and of GAN chunks).

    Returns
    -------
    tuple
        ``(f1_scores, param_list)`` - the macro F1 and the ``best_params_``
        of every fold, in fold order.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Convert once instead of rebuilding the array twice per fold.
    y = np.asarray(y)

    df_gan = df_gan.sample(frac=1, random_state=42).reset_index(drop=True)
    # Split row positions rather than the frame itself: this yields the same
    # partition without calling np.array_split on a DataFrame, a code path
    # that recent pandas versions deprecate.
    gan_chunks = [
        df_gan.iloc[rows]
        for rows in np.array_split(np.arange(len(df_gan)), n_splits)
    ]

    # One fixed label set for every confusion matrix and plot. It covers the
    # generated labels too, so a label that only the GAN data introduces shows
    # up as its own row/column instead of breaking the matrix shapes.
    gan_labels = np.array(df_gan['label'].tolist())
    if len(gan_labels):
        class_labels = np.unique(np.concatenate((y, gan_labels)))
    else:
        class_labels = np.unique(y)

    def _row_percent(counts):
        # Row-normalise to percentages; rows without any true sample stay at 0.
        counts = np.asarray(counts, dtype=float)
        totals = counts.sum(axis=1, keepdims=True)
        return np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0) * 100

    def _show_cm(matrix, values_format, title):
        # Draw one confusion matrix and release the figure afterwards.
        fig, ax = plt.subplots()
        disp = ConfusionMatrixDisplay(matrix, display_labels=class_labels)
        disp.plot(ax=ax, cmap='Blues', values_format=values_format, colorbar=False)
        ax.set_title(title)
        ax.grid(False)
        plt.show()
        plt.close(fig)

    f1_scores, acc_scores, prec_scores, rec_scores = [], [], [], []
    cm_matrices_pct = []
    cm_matrices_raw = []
    param_list = []

    for i, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"\nFold {i}/{n_splits}")

        # Split
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # GAN augmentation (skipped when this fold's chunk is empty, which
        # happens if there are fewer generated rows than folds)
        df_gan_fold = gan_chunks[i - 1]
        if len(df_gan_fold):
            X_gan = np.array(df_gan_fold['embedding'].tolist())
            y_gan = np.array(df_gan_fold['label'].tolist())
            X_train_gan = np.concatenate((X_train, X_gan), axis=0)
            y_train_gan = np.concatenate((y_train, y_gan), axis=0)
        else:
            X_train_gan, y_train_gan = X_train, y_train

        # Training
        grid = train_rf_with_gridsearch(X_train_gan, y_train_gan, X_test, y_test)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)

        # Scores
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        acc_scores.append(acc)
        prec_scores.append(prec)
        rec_scores.append(rec)
        f1_scores.append(f1)
        param_list.append(grid.best_params_)

        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1 (macro): {f1:.4f}")

        # Confusion matrix (raw and percentage), always over the fixed label set
        cm_raw = confusion_matrix(y_test, y_pred, labels=class_labels)
        cm_matrices_raw.append(cm_raw)
        cm_matrices_pct.append(_row_percent(cm_raw))

        _show_cm(cm_raw, "d", f"Confusion Matrix - Fold {i}")

    # Average matrices
    mean_cm_raw = np.rint(np.mean(cm_matrices_raw, axis=0)).astype(int)
    mean_cm_pct = np.mean(cm_matrices_pct, axis=0).round(2)

    _show_cm(mean_cm_raw, "d", f"Mean Confusion Matrix - {label}")
    _show_cm(mean_cm_pct, ".2f", f"Mean Confusion Matrix (Percentage) - {label}")

    # Score summary
    print(f"\nScore summary - {label}")
    print(f"Accuracy   : {np.mean(acc_scores):.4f}")
    print(f"Precision  : {np.mean(prec_scores):.4f}")
    print(f"Recall     : {np.mean(rec_scores):.4f}")
    print(f"F1 (macro) : {np.mean(f1_scores):.4f}")
    print(f"F1 Std Dev : {np.std(f1_scores):.4f}")

    return f1_scores, param_list

In [None]:
# Load the GAN-generated samples and run the 10-fold GAN-augmented evaluation.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
df_gan = pd.read_pickle('path/Dataset/GAN_DF.pkl')

f1s_gan, params_gan = cross_val_with_gan(
    embeddings,
    labels,
    df_gan,
    n_splits=10,
)


### Without Augmentation

In [None]:
def cross_val_without_oversampling(X, y, train_fn, n_splits=5, class_names=["normal", "racism", "sexism"]):
    """Stratified k-fold evaluation on the data as-is (no resampling, no augmentation).

    For each fold a model is fitted through ``train_fn`` on the training part and
    its ``best_estimator_`` is scored on the validation part. Per-fold metrics, a
    classification report and a confusion-matrix heatmap are printed/shown,
    followed by a summary over all folds (mean confusion matrix in counts and in
    row percentages, mean metrics).

    Parameters
    ----------
    X : array-like
        Features; must support indexing with integer index arrays (``X[idx]``).
    y : sequence
        Labels, one per row of ``X``.
    train_fn : callable
        Called as ``train_fn(X_train, y_train, X_val, y_val)``; must return an
        object with ``best_estimator_`` and ``best_params_`` attributes.
    n_splits : int, default 5
        Number of stratified folds.
    class_names : sequence of str
        Display names for the classes, in the sorted order of the distinct
        values found in ``y``.

    Returns
    -------
    tuple
        ``(all_f1_scores, best_params_list)`` - the macro F1 and the
        ``best_params_`` of every fold, in fold order.

    Raises
    ------
    ValueError
        If ``class_names`` does not have one entry per distinct label in ``y``.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Convert once instead of rebuilding the array twice per fold.
    y = np.asarray(y)
    class_names = list(class_names)

    # Pin the label set so every fold yields a confusion matrix of the same shape
    # and order, even if a class is missing from a fold's truth or predictions.
    label_values = np.unique(y)
    if len(class_names) != len(label_values):
        raise ValueError(
            f"class_names has {len(class_names)} entries but y contains "
            f"{len(label_values)} distinct labels"
        )

    def _show_heatmap(matrix, fmt, title):
        # Draw one annotated heatmap and release the figure afterwards.
        fig, ax = plt.subplots(figsize=(6, 5))
        sns.heatmap(matrix, annot=True, fmt=fmt, cmap="Blues", cbar=False,
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.grid(False)
        plt.show()
        plt.close(fig)

    all_f1_scores = []
    all_accuracies = []
    all_precisions = []
    all_recalls = []
    all_conf_matrices = []
    best_params_list = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\nFold {fold} ----------------------------")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Model training
        grid = train_fn(X_train, y_train, X_val, y_val)
        best_model = grid.best_estimator_

        # Predictions
        y_pred = best_model.predict(X_val)

        # Metrics
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_val, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_val, y_pred, average='macro', zero_division=0)

        all_accuracies.append(acc)
        all_precisions.append(prec)
        all_recalls.append(rec)
        all_f1_scores.append(f1)
        best_params_list.append(grid.best_params_)

        print(f"Accuracy : {acc:.4f}")
        print(f"Precision : {prec:.4f}")
        print(f"Recall : {rec:.4f}")
        print(f"F1 macro : {f1:.4f}")
        print(classification_report(y_val, y_pred, labels=label_values,
                                    target_names=class_names, zero_division=0))

        # Confusion matrix (raw counts, fixed label order)
        cm = confusion_matrix(y_val, y_pred, labels=label_values)
        all_conf_matrices.append(cm)
        _show_heatmap(cm, 'd', f"Confusion Matrix (Fold {fold})")

    # Final summary
    print("\nFinal Cross-Validation Summary:")

    # Keep the mean in floating point: percentages are derived from the exact
    # mean, and the displayed counts are rounded rather than truncated.
    mean_cm_exact = np.mean(all_conf_matrices, axis=0)
    row_totals = mean_cm_exact.sum(axis=1, keepdims=True)
    cm_percent = np.round(
        np.divide(mean_cm_exact, row_totals,
                  out=np.zeros_like(mean_cm_exact), where=row_totals > 0) * 100,
        1,
    )
    mean_cm = np.rint(mean_cm_exact).astype(int)

    _show_heatmap(mean_cm, 'd', "Mean Confusion Matrix")
    _show_heatmap(cm_percent, '.1f', "Mean Confusion Matrix (Percentages)")

    print(f"Mean Accuracy : {np.mean(all_accuracies):.4f}")
    print(f"Mean Precision : {np.mean(all_precisions):.4f}")
    print(f"Mean Recall : {np.mean(all_recalls):.4f}")
    print(f"Mean F1 macro : {np.mean(all_f1_scores):.4f}")
    print(f"± Std F1 macro : {np.std(all_f1_scores):.4f}")

    return all_f1_scores, best_params_list

In [None]:
# Baseline: 10-fold evaluation of the random forest without any augmentation.
f1s_no, params_no = cross_val_without_oversampling(
    embeddings,
    labels,
    train_fn=train_rf_with_gridsearch,
    n_splits=10,
)