In [None]:
# standard libraries
import numpy as np
import matplotlib.pyplot as plt

# additional libraries
import pandas as pd
import seaborn as sns
from pandas import read_pickle
from tqdm import tqdm

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay
)
from sklearn.preprocessing import LabelEncoder

# Imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# TensorFlow / Keras 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Input, LSTM


In [None]:
# Load the preprocessed corpus and the GAN-generated samples.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('path/Dataset/Preprocessed_Dataset.pkl')
df_gan = pd.read_pickle('path/Dataset/GAN_DF.pkl')

In [None]:
# Load the GloVe vectors into a dictionary mapping each word to its vector.

embeddings_index = {}
# The context manager guarantees the file is closed even if a malformed line
# makes float() raise in the middle of the loop.
with open('path/GloVe/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is "<word> <v1> <v2> ..."; the first field is the word.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

400000it [00:21, 18196.10it/s]

Found 400000 word vectors.





In [33]:
# Features are the raw texts, targets are the labels of the same rows.
X = df['text'].tolist()
y = df['label'].values

# Hold out 20% of the rows as a test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [34]:
# Length every sequence is padded to further down.
max_len = 40

# NOTE(review): the vocabulary is built from the training, test and GAN texts,
# so words that only occur in the test set also receive an index. Consider
# fitting on the training texts only if strict train/test isolation is required.
all_texts = [*X_train, *X_test, *df_gan["text"]]

token = Tokenizer(num_words=None, oov_token="<OOV>")
token.fit_on_texts(all_texts)

In [35]:
# Turn each corpus into integer sequences and pad them to max_len.
xtrain_pad = sequence.pad_sequences(
    token.texts_to_sequences(X_train),
    maxlen=max_len,
)
xvalid_pad = sequence.pad_sequences(
    token.texts_to_sequences(X_test),
    maxlen=max_len,
)
# Training texts followed by the GAN texts, in that order.
xtrain_pad_gan = sequence.pad_sequences(
    token.texts_to_sequences([*X_train, *df_gan["text"]]),
    maxlen=max_len,
)

In [None]:
# Encode the labels as integers; the encoder is fitted on the training labels only.
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Labels of the GAN-augmented training set: training labels followed by the GAN labels.
y_train_gan = [*y_train, *df_gan["label"]]
y_train_enc_gan = le.transform(y_train_gan)


In [37]:
# Build the embedding table: row i holds the GloVe vector of the word whose
# tokenizer index is i; words without a GloVe vector keep an all-zero row.
word_index = token.word_index
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, 200))
for term, row in tqdm(word_index.items()):
    if term in embeddings_index:
        embedding_matrix[row] = embeddings_index[term]

100%|██████████| 13972/13972 [00:00<00:00, 571561.92it/s]


In [None]:
# Vocabulary words that have no pre-trained GloVe vector.
oov_words = [word for word in word_index if word not in embeddings_index]

# Report vocabulary size, OOV count and OOV share.
print(f"Nombre de mots dans le vocabulaire : {len(word_index)}")
print(f"Nombre de mots OOV (non présents dans GloVe) : {len(oov_words)}")
print(f"Pourcentage OOV : {100 * len(oov_words) / len(word_index):.2f}%")

Nombre de mots dans le vocabulaire : 13972
Nombre de mots OOV (non présents dans GloVe) : 1248
Pourcentage OOV : 8.93%


In [None]:
def evaluate_model(model, x_test_pad, y_test_enc, model_name="Model"):
    """Print and plot the evaluation of a classifier on a held-out set.

    Args:
        model: Object exposing ``predict``; its output is reduced with argmax
            over axis 1 to obtain the predicted class index.
        x_test_pad: Inputs passed to ``model.predict``.
        y_test_enc: True class indices compared against the predictions.
        model_name: Label used in the printed results header.

    Returns:
        None. The confusion matrix, classification report and weighted
        accuracy/precision/recall/F1 (rounded to 2 decimals) are printed, and
        the confusion matrix is plotted.
    """
    # Predicted class = index of the highest score per row.
    predicted = np.argmax(model.predict(x_test_pad), axis=1)

    # Shared settings for the weighted-average metrics.
    weighted = dict(average='weighted', zero_division=0)
    accuracy = round(accuracy_score(y_test_enc, predicted), 2)
    precision = round(precision_score(y_test_enc, predicted, **weighted), 2)
    recall = round(recall_score(y_test_enc, predicted, **weighted), 2)
    f1 = round(f1_score(y_test_enc, predicted, **weighted), 2)

    # Computed once and reused for both the text output and the plot.
    cm = confusion_matrix(y_test_enc, predicted)

    print(f"\n--- Results {model_name} ---")
    print("confusion matrix :")
    print(cm)
    ConfusionMatrixDisplay(cm).plot(cmap=plt.cm.Blues)

    print("\nclassification report :")
    print(classification_report(y_test_enc, predicted, zero_division=0))

    for label, value in (("Accuracy", accuracy), ("Precision", precision),
                         ("Recall", recall), ("F1-Score", f1)):
        print(f"{label}: {value}")

    return None

In [40]:
def simple_lstm_model(vocab_size, embedding_matrix, max_len, num_classes=3):
    """Build and compile a two-layer LSTM classifier on frozen pre-trained embeddings.

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim`` of the
            Embedding layer).
        embedding_matrix: 2-D array of pre-trained vectors used as the
            (non-trainable) embedding weights; its second dimension sets the
            embedding size.
        max_len: Length of the integer input sequences.
        num_classes: Number of output classes. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    # Derive the embedding width from the matrix instead of hard-coding it.
    embedding_dim = np.asarray(embedding_matrix).shape[1]

    input_layer = Input(shape=(max_len,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    )(input_layer)
    rnn1 = LSTM(100, return_sequences=True)(embedding_layer)
    rnn2 = LSTM(50)(rnn1)
    # softmax (not sigmoid): the loss below expects one probability
    # distribution over mutually exclusive classes per sample.
    output_layer = Dense(num_classes, activation='softmax')(rnn2)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build one model instance; the embedding table has one more row than
# word_index has entries, hence the +1.
model_lstm = simple_lstm_model(
    vocab_size=len(word_index) + 1,
    embedding_matrix=embedding_matrix,
    max_len=max_len,
)

In [None]:
from sklearn.model_selection import StratifiedKFold

def cross_val_lstm(X, y, build_model_fn, n_splits=5, epochs=5, batch_size=32, max_len=40, class_names=None):
    """Run stratified k-fold cross-validation of a Keras classifier and report metrics.

    For every fold a new model is built with ``build_model_fn``, trained on the
    training part and scored on the held-out part. Per-fold metrics, the
    classification report and a confusion-matrix heatmap are shown, followed by
    the mean confusion matrix (counts and row percentages) and the averaged
    metrics.

    Args:
        X: Array-like of model inputs; converted with ``np.asarray`` so it can
            be indexed with the fold index arrays.
        y: Array-like of class labels aligned with ``X``.
        build_model_fn: Zero-argument callable returning a new compiled model
            exposing ``fit`` and ``predict``.
        n_splits: Number of folds.
        epochs: Number of epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
        max_len: Unused; kept so existing calls keep working.
        class_names: Optional display names, one per class, used in the
            reports and as heatmap tick labels.

    Returns:
        None. All results are printed or plotted.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # A fixed label set gives every fold's confusion matrix the same shape,
    # even when a class is never predicted in a fold, so they can be averaged.
    labels = np.unique(y)
    # Names must be strings for the report; 'auto' lets seaborn pick tick
    # labels when no names are given.
    target_names = None if class_names is None else [str(name) for name in class_names]
    tick_labels = 'auto' if target_names is None else target_names

    def _plot_cm(matrix, fmt, title):
        # Draw one annotated confusion-matrix heatmap.
        plt.figure(figsize=(6, 5))
        sns.heatmap(matrix, annot=True, fmt=fmt, cmap="Blues", cbar=False,
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title(title)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.grid(False)
        plt.show()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_accuracies, all_precisions, all_recalls, all_f1s, all_conf_matrices = [], [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\nFold {fold} ----------------------------")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Build a new model for each fold so no weights leak between folds.
        model = build_model_fn()
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predicted class = index of the highest score per row.
        y_pred = np.argmax(model.predict(X_val), axis=1)

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_val, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_val, y_pred, average='weighted', zero_division=0)

        all_accuracies.append(acc)
        all_precisions.append(prec)
        all_recalls.append(rec)
        all_f1s.append(f1)

        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
        print(classification_report(y_val, y_pred, labels=labels,
                                    target_names=target_names, zero_division=0))

        # Raw per-fold confusion matrix.
        cm = confusion_matrix(y_val, y_pred, labels=labels)
        all_conf_matrices.append(cm)
        _plot_cm(cm, 'd', f"Confusion Matrix (Fold {fold})")

    # Keep the mean as floats: percentages are derived from the exact mean and
    # counts are rounded (not truncated) only for display.
    mean_cm = np.mean(all_conf_matrices, axis=0)
    row_totals = mean_cm.sum(axis=1, keepdims=True)
    # Rows without any sample stay at 0% instead of producing NaN.
    row_share = np.divide(mean_cm, row_totals, out=np.zeros_like(mean_cm), where=row_totals != 0)
    cm_percent = np.round(row_share * 100, 1)

    _plot_cm(np.rint(mean_cm).astype(int), 'd', "Mean Confusion Matrix")
    _plot_cm(cm_percent, '.1f', "Mean Confusion Matrix (Percentages)")

    print("\nCross-validation summary:")
    print(f"Mean Accuracy: {np.mean(all_accuracies):.4f}")
    print(f"Mean Precision: {np.mean(all_precisions):.4f}")
    print(f"Mean Recall: {np.mean(all_recalls):.4f}")
    print(f"Mean F1: {np.mean(all_f1s):.4f}")
    print(f"Std F1: {np.std(all_f1s):.4f}")

    return

In [None]:
def build_lstm():
    """Return a newly built LSTM model sized from the notebook's word_index, embedding_matrix and max_len."""
    return simple_lstm_model(
        vocab_size=len(word_index) + 1,
        embedding_matrix=embedding_matrix,
        max_len=max_len,
    )


# Baseline: 10-fold cross-validation on the original training data.
cross_val_lstm(
    xtrain_pad,
    y_train_enc,
    build_model_fn=build_lstm,
    n_splits=10,
    epochs=5,
    max_len=max_len,
    class_names=le.classes_,
)

### With ROS (RandomOverSampler)

In [None]:
def cross_val_lstm_with_ros(X, y, build_model_fn, n_splits=5, epochs=5, batch_size=32, max_len=40, class_names=None):
    """Stratified k-fold cross-validation with random over-sampling of each training fold.

    In every fold the training part is resampled with ``RandomOverSampler``
    (the validation part is left untouched), a new model is built with
    ``build_model_fn``, trained and scored. Per-fold metrics, the
    classification report and a confusion-matrix heatmap are shown, followed by
    the mean confusion matrix (counts and row percentages) and the averaged
    metrics.

    Args:
        X: Array-like of model inputs; converted with ``np.asarray`` so it can
            be indexed with the fold index arrays.
        y: Array-like of class labels aligned with ``X``.
        build_model_fn: Zero-argument callable returning a new compiled model
            exposing ``fit`` and ``predict``.
        n_splits: Number of folds.
        epochs: Number of epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
        max_len: Unused; kept so existing calls keep working.
        class_names: Optional display names, one per class, used in the
            reports and as heatmap tick labels.

    Returns:
        None. All results are printed or plotted.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # A fixed label set gives every fold's confusion matrix the same shape,
    # even when a class is never predicted in a fold, so they can be averaged.
    labels = np.unique(y)
    # Names must be strings for the report; 'auto' lets seaborn pick tick
    # labels when no names are given.
    target_names = None if class_names is None else [str(name) for name in class_names]
    tick_labels = 'auto' if target_names is None else target_names

    def _plot_cm(matrix, fmt, title):
        # Draw one annotated confusion-matrix heatmap.
        plt.figure(figsize=(6, 5))
        sns.heatmap(matrix, annot=True, fmt=fmt, cmap="Blues", cbar=False,
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title(title)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.grid(False)
        plt.show()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    ros = RandomOverSampler(random_state=42)

    all_accuracies, all_precisions, all_recalls, all_f1s = [], [], [], []
    all_conf_matrices = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\nFold {fold}/{n_splits}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Over-sample the training part only, so validation samples are never
        # duplicated into the training data.
        X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

        # New model for each fold so no weights leak between folds.
        model = build_model_fn()
        model.fit(X_train_res, y_train_res, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predicted class = index of the highest score per row.
        y_pred = np.argmax(model.predict(X_val), axis=1)

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_val, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_val, y_pred, average='weighted', zero_division=0)

        all_accuracies.append(acc)
        all_precisions.append(prec)
        all_recalls.append(rec)
        all_f1s.append(f1)

        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
        print(classification_report(y_val, y_pred, labels=labels,
                                    target_names=target_names, zero_division=0))

        # Raw per-fold confusion matrix.
        cm = confusion_matrix(y_val, y_pred, labels=labels)
        all_conf_matrices.append(cm)
        _plot_cm(cm, 'd', f"Confusion Matrix (Fold {fold})")

    # Keep the mean as floats: percentages are derived from the exact mean and
    # counts are rounded (not truncated) only for display.
    mean_cm = np.mean(all_conf_matrices, axis=0)
    row_totals = mean_cm.sum(axis=1, keepdims=True)
    # Rows without any sample stay at 0% instead of producing NaN.
    row_share = np.divide(mean_cm, row_totals, out=np.zeros_like(mean_cm), where=row_totals != 0)
    cm_percent = np.round(row_share * 100, 1)

    _plot_cm(np.rint(mean_cm).astype(int), 'd', "Mean Confusion Matrix")
    _plot_cm(cm_percent, '.1f', "Mean Confusion Matrix (Percentages)")

    print("\nCross-validation summary (with RandomOverSampler):")
    print(f"Mean Accuracy: {np.mean(all_accuracies):.4f}")
    print(f"Mean Precision: {np.mean(all_precisions):.4f}")
    print(f"Mean Recall: {np.mean(all_recalls):.4f}")
    print(f"Mean F1: {np.mean(all_f1s):.4f}")
    print(f"Std F1: {np.std(all_f1s):.4f}")

    return

In [None]:
def build_lstm():
    """Return a newly built LSTM model sized from the notebook's word_index, embedding_matrix and max_len."""
    return simple_lstm_model(
        vocab_size=len(word_index) + 1,
        embedding_matrix=embedding_matrix,
        max_len=max_len,
    )


# 10-fold cross-validation with random over-sampling of each training fold.
cross_val_lstm_with_ros(
    xtrain_pad,
    y_train_enc,
    build_model_fn=build_lstm,
    n_splits=10,
    epochs=5,
    class_names=le.classes_,
    max_len=max_len,
)

### With GAN augmentation

In [None]:
def cross_val_lstm_with_gan(X, y, df_gan, build_model_fn, n_splits=5, epochs=5, batch_size=32, max_len=40,
                            class_names=None, tokenizer=None, label_encoder=None):
    """Stratified k-fold cross-validation where each training fold is augmented with GAN samples.

    The GAN frame is shuffled once and cut into ``n_splits`` chunks; fold *k*
    adds chunk *k* (tokenized, padded and label-encoded) to its training part.
    The validation part contains original data only. Per-fold metrics, the
    classification report and a confusion-matrix heatmap are shown, followed by
    the mean confusion matrix (counts and row percentages) and the averaged
    metrics.

    Args:
        X: Array-like of padded input sequences; converted with ``np.asarray``
            so it can be indexed with the fold index arrays.
        y: Array-like of encoded class labels aligned with ``X``.
        df_gan: DataFrame with a ``text`` and a ``label`` column holding the
            generated samples. It is not modified.
        build_model_fn: Zero-argument callable returning a new compiled model
            exposing ``fit`` and ``predict``.
        n_splits: Number of folds (and of GAN chunks).
        epochs: Number of epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
        max_len: Length the GAN sequences are padded to.
        class_names: Optional display names, one per class, used in the
            reports and as heatmap tick labels.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``. Defaults
            to the notebook-level ``token``.
        label_encoder: Fitted encoder exposing ``transform``. Defaults to the
            notebook-level ``le``.

    Returns:
        None. All results are printed or plotted.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if tokenizer is None:
        tokenizer = token
    if label_encoder is None:
        label_encoder = le

    X = np.asarray(X)
    y = np.asarray(y)

    # A fixed label set gives every fold's confusion matrix the same shape,
    # even when a class is never predicted in a fold, so they can be averaged.
    labels = np.unique(y)
    # Names must be strings for the report; 'auto' lets seaborn pick tick
    # labels when no names are given.
    target_names = None if class_names is None else [str(name) for name in class_names]
    tick_labels = 'auto' if target_names is None else target_names

    def _plot_cm(matrix, fmt, title):
        # Draw one annotated confusion-matrix heatmap.
        plt.figure(figsize=(6, 5))
        sns.heatmap(matrix, annot=True, fmt=fmt, cmap="Blues", cbar=False,
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title(title)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.grid(False)
        plt.show()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Shuffle once, then split row positions (rather than the DataFrame itself)
    # into n_splits nearly equal chunks.
    df_gan = df_gan.sample(frac=1, random_state=42).reset_index(drop=True)
    gan_chunks = [df_gan.iloc[rows] for rows in np.array_split(np.arange(len(df_gan)), n_splits)]

    all_accuracies, all_precisions, all_recalls, all_f1s = [], [], [], []
    all_conf_matrices = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\nFold {fold}/{n_splits}")

        # Original data split.
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # GAN augmentation: this fold's chunk is added to the training part only.
        df_gan_fold = gan_chunks[fold - 1]
        if len(df_gan_fold):
            X_gan = sequence.pad_sequences(
                tokenizer.texts_to_sequences(df_gan_fold['text'].tolist()), maxlen=max_len)
            y_gan = label_encoder.transform(df_gan_fold['label'].tolist())
            X_train_gan = np.concatenate((X_train, X_gan), axis=0)
            y_train_gan = np.concatenate((y_train, y_gan), axis=0)
        else:
            # Fewer GAN rows than folds: nothing to add for this fold.
            X_train_gan, y_train_gan = X_train, y_train

        # New model for each fold so no weights leak between folds.
        model = build_model_fn()
        model.fit(X_train_gan, y_train_gan, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predicted class = index of the highest score per row.
        y_pred = np.argmax(model.predict(X_val), axis=1)

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_val, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_val, y_pred, average='weighted', zero_division=0)

        all_accuracies.append(acc)
        all_precisions.append(prec)
        all_recalls.append(rec)
        all_f1s.append(f1)

        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
        print(classification_report(y_val, y_pred, labels=labels,
                                    target_names=target_names, zero_division=0))

        # Raw per-fold confusion matrix.
        cm = confusion_matrix(y_val, y_pred, labels=labels)
        all_conf_matrices.append(cm)
        _plot_cm(cm, 'd', f"Confusion Matrix (Fold {fold})")

    # Keep the mean as floats: percentages are derived from the exact mean and
    # counts are rounded (not truncated) only for display.
    mean_cm = np.mean(all_conf_matrices, axis=0)
    row_totals = mean_cm.sum(axis=1, keepdims=True)
    # Rows without any sample stay at 0% instead of producing NaN.
    row_share = np.divide(mean_cm, row_totals, out=np.zeros_like(mean_cm), where=row_totals != 0)
    cm_percent = np.round(row_share * 100, 1)

    _plot_cm(np.rint(mean_cm).astype(int), 'd', "Mean Confusion Matrix")
    _plot_cm(cm_percent, '.1f', "Mean Confusion Matrix (Percentages)")

    print("\nCross-validation summary (with GAN):")
    print(f"Mean Accuracy: {np.mean(all_accuracies):.4f}")
    print(f"Mean Precision: {np.mean(all_precisions):.4f}")
    print(f"Mean Recall: {np.mean(all_recalls):.4f}")
    print(f"Mean F1: {np.mean(all_f1s):.4f}")
    print(f"Std F1: {np.std(all_f1s):.4f}")

    return


In [None]:
def build_lstm():
    """Return a newly built LSTM model sized from the notebook's word_index, embedding_matrix and max_len."""
    return simple_lstm_model(
        vocab_size=len(word_index) + 1,
        embedding_matrix=embedding_matrix,
        max_len=max_len,
    )


# 10-fold cross-validation with GAN samples added to each training fold.
cross_val_lstm_with_gan(
    xtrain_pad,
    y_train_enc,
    df_gan=df_gan,
    build_model_fn=build_lstm,
    n_splits=10,
    epochs=5,
    max_len=max_len,
    class_names=le.classes_,
)