## Imports

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, issparse
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    confusion_matrix,
)

import lightgbm as lgb

from IPython.display import display
import ipywidgets as widgets

import os

import pickle


---

## 🔢 Feature Extraction

<!-- TODO: Explain -->

| Model               | Best Feature Extraction Method |
| :------------------ | :----------------------------- |
| Random Forest       | TF-IDF + Chi-Square            |
| Logistic Regression | TF-IDF + Information Gain      |
| Gradient Boosting   | GloVe + PCA                    |
<!-- | SVM                 | TF-IDF + BoW                   |
| DistilBERT          | Fine-tuned BERT embeddings     | -->


In [None]:
# Load the augmented training set; its "label" column is read further down
# as the training targets.
final_train_df = pd.read_csv("dataset_and_corpora/augmented_train_senti.csv")


In [None]:
def save_vectorizer(model, filename, folder="vectorizers"):
    """
    Pickle an object into ``folder/filename``.

    :param model: Any picklable object (vectorizer, selector, PCA model, ...)
    :param filename: Name of the pickle file to write
    :param folder: Destination directory; created when it does not exist yet
    """
    # Create the destination (including missing parents) before opening the file.
    os.makedirs(folder, exist_ok=True)
    target_path = os.path.join(folder, filename)

    with open(target_path, "wb") as out_file:
        pickle.dump(model, out_file)

    print(f"[INFO] Saved model to {target_path}")


### TF-IDF Feature Extraction

TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used in natural language processing and information retrieval to evaluate the importance of a word in a document relative to a collection of documents (corpus).


In [None]:
def extract_tfidf_features(df: pd.DataFrame, max_features: int = 5000) -> csr_matrix:
    """
    Fit a TF-IDF vectorizer on the 'content' column and return the document-term matrix.

    Side effects: missing values in ``df["content"]`` are replaced by empty
    strings in the caller's DataFrame, and the fitted vectorizer is pickled as
    ``final_tfidf_vectorizer.pkl`` through ``save_vectorizer``.

    :param df: Input DataFrame with a 'content' column
    :param max_features: Upper bound on the vocabulary size kept by the vectorizer
    :return: TF-IDF features as a sparse CSR matrix
    """
    bar = widgets.IntProgress(value=0, min=0, max=1, description="TF-IDF:")
    display(bar)

    # The vectorizer cannot handle NaN documents, so blank them out first.
    df["content"] = df["content"].fillna("")

    tfidf = TfidfVectorizer(max_features=max_features)
    document_term = csr_matrix(tfidf.fit_transform(df["content"]))

    # Persist the fitted vocabulary / IDF weights for later reuse.
    save_vectorizer(tfidf, "final_tfidf_vectorizer.pkl")

    bar.value = 1
    return document_term


#### Chi-Square Feature Extraction

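The Chi-Square test scores each TF-IDF feature by its statistical dependence on the class label; only the `k` highest-scoring features are kept.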

In [None]:
def extract_chi2_features(
    tfidf_features: csr_matrix, labels: np.ndarray, k: int = 5000
) -> csr_matrix:
    """
    Keep the k TF-IDF columns with the highest Chi-Square score against the labels.

    The fitted selector is pickled as ``final_chi2_selector.pkl`` through
    ``save_vectorizer``.

    :param tfidf_features: TF-IDF features as a sparse matrix
    :param labels: Class label for every row of ``tfidf_features``
    :param k: Number of columns to keep
    :return: Reduced feature set as a sparse CSR matrix
    """
    bar = widgets.IntProgress(value=0, min=0, max=1, description="Chi2:")
    display(bar)

    selector = SelectKBest(chi2, k=k)
    selected = csr_matrix(selector.fit_transform(tfidf_features, labels))

    # Persist the fitted column mask for later reuse.
    save_vectorizer(selector, "final_chi2_selector.pkl")

    bar.value = 1
    return selected


#### Bag of Words Feature Extraction

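Bag of Words represents each document by raw term counts. Here the counts are computed over the same vocabulary that the fitted TF-IDF vectorizer learned.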

In [None]:
def extract_bow_features(tfidf_vectorizer: TfidfVectorizer, df: pd.DataFrame):
    """
    Count term occurrences in 'content' over the vocabulary of a fitted TF-IDF vectorizer.

    Side effects: missing values in ``df["content"]`` are replaced by empty
    strings in the caller's DataFrame, and the count vectorizer is pickled as
    ``final_bow_vectorizer.pkl`` through ``save_vectorizer``.

    :param tfidf_vectorizer: Fitted TF-IDF vectorizer whose ``vocabulary_`` is reused
    :param df: Input DataFrame with a 'content' column
    :return: Term-count matrix as returned by ``CountVectorizer.fit_transform``
    """
    bar = widgets.IntProgress(value=0, min=0, max=1, description="BoW:")
    display(bar)

    df["content"] = df["content"].fillna("")

    # The vocabulary is fixed up front, so the columns follow the TF-IDF vocabulary.
    counter = CountVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)
    counts = counter.fit_transform(df["content"])

    save_vectorizer(counter, "final_bow_vectorizer.pkl")

    bar.value = 1
    return counts


#### Information Gain Feature Extraction

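Information Gain (mutual information) measures how much knowing a feature reduces uncertainty about the class label; only the `k` most informative TF-IDF features are kept.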

In [None]:
def extract_information_gain_features(
    tfidf_features: csr_matrix, labels: np.ndarray, k: int = 5000
) -> "csr_matrix | np.ndarray":
    """
    Select the top k features by Information Gain (mutual information).

    Only the reduced feature matrix is returned. The fitted ``SelectKBest``
    object is not returned; it is pickled as ``final_infogain_selector.pkl``
    through ``save_vectorizer``.

    :param tfidf_features: TF-IDF features (sparse matrix)
    :param labels: Class label per row; any array-like castable to int
    :param k: Number of top features to keep
    :return: Reduced features as produced by ``SelectKBest.fit_transform``
        (other code in this file calls ``.toarray()`` on the stored result,
        i.e. treats it as sparse)
    """
    progress = widgets.IntProgress(value=0, min=0, max=1, description="InfoGain:")
    display(progress)

    # np.asarray lets callers pass a list or pandas Series, not just an ndarray.
    labels = np.asarray(labels).astype(int)

    # NOTE(review): mutual_info_classif accepts a random_state; it is left at
    # its default here, so scores may not be reproducible run to run — confirm
    # whether that matters for this pipeline.
    infogain_selector = SelectKBest(score_func=mutual_info_classif, k=k)
    reduced_features = infogain_selector.fit_transform(tfidf_features, labels)

    save_vectorizer(infogain_selector, "final_infogain_selector.pkl")

    progress.value = 1
    return reduced_features


### GloVe Feature Extraction

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

**Introduced in** Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/pubs/glove.pdf).

Before running the cell, please download the pre-trained word vectors (Wikipedia 2014 + Gigaword 5) from [here](https://nlp.stanford.edu/data/glove.6B.zip) and move `glove.6B.100d.txt` into the `dataset_and_corpora` folder.

In [None]:
def extract_glove_features(
    df: pd.DataFrame,
    glove_path: str = "dataset_and_corpora/glove.6B.100d.txt",
    embedding_dim: int = 100,
) -> np.ndarray:
    """
    Extract GloVe features from the 'content' column of the DataFrame.

    Each document is represented by the mean of the GloVe vectors of its
    whitespace-separated tokens. Tokens without an embedding are skipped; a
    document with no known token (including empty or missing content) gets
    the zero vector.

    :param df: Input DataFrame with a 'content' column
    :param glove_path: Path to the GloVe embeddings file
    :param embedding_dim: Dimension of the GloVe embeddings
    :return: float32 array of shape (len(df), embedding_dim)
    """
    n_rows = len(df)
    progress = widgets.IntProgress(value=0, min=0, max=n_rows, description="GloVe:")
    display(progress)

    # Load GloVe embeddings: one "<word> <v1> ... <vd>" entry per line.
    glove_embeddings = {}
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            glove_embeddings[values[0]] = np.asarray(values[1:], dtype="float32")

    # Pre-allocating keeps the result 2-D even for an empty DataFrame and
    # makes the all-unknown case fall back to the zero vector for free.
    glove_features = np.zeros((n_rows, embedding_dim), dtype="float32")

    # Refresh the bar roughly 100 times instead of once per row.
    step = max(1, n_rows // 100)

    # fillna works on a copy, so the caller's DataFrame is left untouched
    # while missing content no longer crashes .split().
    for row, content in enumerate(df["content"].fillna("")):
        vectors = [
            glove_embeddings[word]
            for word in content.split()
            if word in glove_embeddings
        ]
        if vectors:
            glove_features[row] = np.mean(vectors, axis=0)
        if (row + 1) % step == 0:
            progress.value = row + 1

    progress.value = n_rows
    return glove_features


#### PCA Feature Extraction

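Principal Component Analysis (PCA) projects the GloVe sentence embeddings onto their leading principal components to reduce dimensionality.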

In [None]:
def perform_pca(features: np.ndarray, n_components: int = 100) -> np.ndarray:
    """
    Fit PCA on the features and return their projection onto the principal components.

    The fitted PCA object is pickled as ``final_pca_model.pkl`` through
    ``save_vectorizer``.

    :param features: Input features as a NumPy array
    :param n_components: Number of principal components to retain
    :return: Projected features as a NumPy array
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(features)

    # Persist the fitted projection for later reuse.
    save_vectorizer(reducer, "final_pca_model.pkl")

    return projected


---

## 🌲 Gradient Boosting (LightGBM)

Gradient Boosting is a state-of-the-art ensemble machine learning technique that builds a strong predictive model by combining the outputs of many weak learners, typically decision trees, in a sequential manner. LightGBM (Light Gradient Boosting Machine) is a highly efficient and scalable implementation of gradient boosting, optimized for both speed and performance, and supports GPU acceleration for large-scale data mining tasks. LightGBM is particularly effective for classification problems such as fake news detection, where complex feature interactions and high-dimensional data are common. In this implementation, we use the [LightGBM](https://lightgbm.readthedocs.io/en/latest/) library with GPU support and advanced hyperparameters for robust model training and evaluation.

The following code performs:
1. _[Feature Loading](#loading-features):_ Loads preprocessed features ([GloVe word embeddings reduced by PCA](#pca-feature-extraction), plus the TF-IDF based BoW, Chi-Square and Information Gain feature sets) to provide informative representations of text data.
2. _[K-Fold Cross-Validation](#test-lightgbm-with-k-fold-cross-validation):_ Uses 5-fold cross-validation to rigorously assess model performance, ensuring that results are robust and generalizable across different data splits.
3. _Ensemble Predictions:_ Stores the probabilistic predictions from all folds, one file per feature set (in `predictions/lightgbm_w_<feature_set>_ensemble_predictions.pkl`), enabling ensemble analysis and further model comparison.
4. _[Final Model](#train-lightgbm-on-training-set):_ Trains a final LightGBM model on the entire dataset, saves the trained model (in `models/lgbm_<feature_set>_model.txt`), and records the training loss curve for further evaluation and visualization.


### Loading Features

In [None]:
# Load the pre-computed training feature sets from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open("features/glove_pca_features.pkl", "rb") as file:
    glove_pca_features = pickle.load(file)

# The TF-IDF based feature sets are cast to float32 right after loading.
with open("features/tfidf_bow_features.pkl", "rb") as file:
    tfidf_bow_features = pickle.load(file).astype(np.float32)


with open("features/tfidf_chi2_features.pkl", "rb") as file:
    tfidf_chi2_features = pickle.load(file).astype(np.float32)

with open("features/tfidf_info_gain_features.pkl", "rb") as file:
    tfidf_info_gain_features = pickle.load(file).astype(np.float32)

# Training targets, read from the "label" column of the training DataFrame.
# Presumably row-aligned with the feature matrices above — TODO confirm.
labels = final_train_df["label"].to_numpy()


### Test LightGBM with K-Fold Cross-Validation

In [None]:
def _predict_in_batches(model, X, batch_size=1024):
    """Predict class probabilities for X in row slices of batch_size and stack them."""
    return np.vstack(
        [
            np.asarray(
                model.predict(
                    X[start : start + batch_size],
                    num_iteration=model.best_iteration,
                )
            )
            for start in range(0, X.shape[0], batch_size)
        ]
    )


def lgbm_KCross_Val(name, features):
    """
    Evaluate LightGBM on a feature set with shuffled 5-fold cross-validation.

    For every fold a model is trained with early stopping, the held-out rows
    are predicted in batches, and accuracy plus a classification report are
    printed. The held-out probabilities of all folds are collected into one
    array and pickled to
    ``predictions/lightgbm_w_{name}_ensemble_predictions.pkl``.

    The targets are read from the module-level ``labels`` array.

    NOTE(review): the held-out fold is also the validation set that drives
    early stopping, so the reported fold accuracies may be optimistic.

    :param name: Feature-set name, used in log output and the output filename
    :param features: Row-indexable feature matrix with one row per label
    :return: None
    """
    k = 5
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Identical for every fold, so built once. Early-stopping patience is
    # configured only through the callback below: the original set it both
    # here (50) and in the callback (100). With both registered, the shorter
    # patience stops training first, so 50 is kept as the effective value.
    params = {
        "objective": "multiclass",
        "num_class": 2,
        "boosting_type": "gbdt",
        "metric": "multi_logloss",
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
        "learning_rate": 0.01,
        "num_leaves": 31,
        "max_depth": -1,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbosity": -1,
    }
    batch_size = 1024

    fold_accuracies = []
    # One probability row per sample; every row is filled by exactly one fold.
    ensemble_predictions = np.zeros((len(labels), 2))

    print(f"\n==== k cross fold eval on {name} ====\n")
    for fold, (train_idx, test_idx) in enumerate(kf.split(features)):
        print(f"Fold {fold + 1}/{k}")

        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

        lgb_model = lgb.train(
            params,
            train_data,
            num_boost_round=10000,
            valid_sets=[train_data, test_data],
            callbacks=[lgb.early_stopping(stopping_rounds=50)],
        )

        y_pred_proba = _predict_in_batches(lgb_model, X_test, batch_size)
        y_pred = np.argmax(y_pred_proba, axis=1)

        ensemble_predictions[test_idx] = y_pred_proba

        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)
        print(f"Accuracy for fold {fold + 1}: {accuracy:.4f}")
        print("\nClassification Report:\n", classification_report(y_test, y_pred))

    average_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print(f"\nAverage Accuracy across {k} folds: {average_accuracy:.4f}\n")

    # Create the output folder so a fresh checkout does not fail on open().
    os.makedirs("predictions", exist_ok=True)
    with open(f"predictions/lightgbm_w_{name}_ensemble_predictions.pkl", "wb") as file:
        pickle.dump(ensemble_predictions, file)


In [None]:
# Cross-validate LightGBM on the GloVe + PCA features.
lgbm_KCross_Val("glove_pca", glove_pca_features)


In [None]:
# Cross-validate on the TF-IDF + BoW features; .toarray() densifies the
# matrix before it is handed to the cross-validation routine.
lgbm_KCross_Val("tfidf_bow", tfidf_bow_features.toarray())


In [None]:
# Cross-validate on the TF-IDF + Chi-Square features (densified first).
lgbm_KCross_Val("tfidf_chi2", tfidf_chi2_features.toarray())


In [None]:
# Cross-validate on the TF-IDF + Information Gain features (densified first).
lgbm_KCross_Val("tfidf_infogain", tfidf_info_gain_features.toarray())


### Train LightGBM on Training Set

In [None]:
# Wrap every full training feature set in a LightGBM Dataset, each paired
# with the same training labels.
glove_pca_train_data = lgb.Dataset(glove_pca_features, label=labels)
tfidf_chi2_train_data = lgb.Dataset(tfidf_chi2_features, label=labels)
tfidf_bow_train_data = lgb.Dataset(tfidf_bow_features, label=labels)
tfidf_info_gain_train_data = lgb.Dataset(tfidf_info_gain_features, label=labels)


def full_train(name, train_data):
    """
    Train a final LightGBM model on a complete training Dataset.

    The model is trained for a fixed 10000 boosting rounds (no early
    stopping), saved to ``models/lgbm_{name}_model.txt``, and its training
    loss curve is plotted and saved to
    ``evaluations/lgbm_{name}_training_loss_curve.png``. The plot path
    includes ``name`` so successive calls no longer overwrite each other's
    figure.

    :param name: Feature-set name, used in the model and plot filenames
    :param train_data: ``lgb.Dataset`` holding all training rows and labels
    :return: None
    """
    evals_result = {}

    params = {
        "objective": "multiclass",
        "num_class": 2,
        "boosting_type": "gbdt",
        "metric": "multi_logloss",
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
        "learning_rate": 0.01,
        "num_leaves": 31,
        "max_depth": -1,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbosity": -1,
    }

    # The training set doubles as the only "validation" set so that
    # record_evaluation captures the per-round training loss.
    final_lgb_model = lgb.train(
        params,
        train_data,
        num_boost_round=10000,
        valid_sets=[train_data],
        valid_names=["train"],
        callbacks=[
            lgb.record_evaluation(evals_result),
        ],
    )

    # Save model
    os.makedirs("models", exist_ok=True)
    final_lgb_model.save_model(f"models/lgbm_{name}_model.txt")

    train_loss = evals_result["train"]["multi_logloss"]

    os.makedirs("evaluations", exist_ok=True)
    plt.figure(figsize=(8, 5))
    plt.plot(train_loss, label="Train Loss")
    plt.title(f"Final LightGBM Model ({name}) - Training Loss Curve")
    plt.xlabel("Boosting Round")
    plt.xlim(0, len(train_loss))
    plt.ylabel("Multi Log Loss")
    plt.ylim(0, max(train_loss) * 1.1)
    plt.legend()
    plt.grid(True)
    plt.savefig(f"evaluations/lgbm_{name}_training_loss_curve.png")
    plt.show()


# Train and save the final models. Only the GloVe+PCA and TF-IDF+BoW
# variants are trained; the other two calls are left disabled.
full_train("glove_pca", glove_pca_train_data)
full_train("tfidf_bow", tfidf_bow_train_data)
# full_train("tfidf_chi2", tfidf_chi2_train_data)
# full_train("tfidf_infogain", tfidf_info_gain_train_data)


# 4. 🍎 Testing Testing



## Test Set Feature Extraction

In [None]:
class TestFeatureExtractor:
    """
    Apply the extractors fitted at training time to unseen data.

    On construction the pickled TF-IDF vectorizer, Chi2 selector, BoW
    vectorizer, InfoGain selector and PCA model are loaded from
    ``vectorizers/``, and the GloVe embeddings are read from
    ``dataset_and_corpora/glove.6B.100d.txt``. The ``transform_*`` methods
    only call ``transform`` on those objects; nothing is re-fitted.

    Missing values in the 'content' column are treated as empty documents.
    The caller's DataFrame is never modified.
    """

    def __init__(self):
        """Load all pre-fitted vectorizers, selectors, and models needed for test feature extraction."""
        print("[INFO] Loading feature extractors...")

        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files that were produced by this project.

        # Load TF-IDF Vectorizer
        with open("vectorizers/final_tfidf_vectorizer.pkl", "rb") as f:
            self.tfidf_vectorizer = pickle.load(f)

        # Load Chi2 Selector
        with open("vectorizers/final_chi2_selector.pkl", "rb") as f:
            self.chi2_selector = pickle.load(f)

        # Load BoW Vectorizer
        with open("vectorizers/final_bow_vectorizer.pkl", "rb") as f:
            self.bow_vectorizer = pickle.load(f)

        # Load InfoGain Selector
        with open("vectorizers/final_infogain_selector.pkl", "rb") as f:
            self.infogain_selector = pickle.load(f)

        # Load GloVe Embeddings
        self.glove_embeddings = self.load_glove_embeddings(
            "dataset_and_corpora/glove.6B.100d.txt"
        )

        # Load PCA model for GloVe
        with open("vectorizers/final_pca_model.pkl", "rb") as f:
            self.pca_model = pickle.load(f)

    def load_glove_embeddings(self, glove_file_path):
        """
        Load GloVe embeddings from a text file into a dictionary.

        :param glove_file_path: Path to a file with one "<word> <v1> ... <vd>" entry per line
        :return: dict mapping each word to its float32 vector
        """
        print("[INFO] Loading GloVe embeddings...")
        embeddings = {}
        with open(glove_file_path, "r", encoding="utf-8") as f:
            for line in f:
                values = line.strip().split()
                if not values:
                    # Tolerate blank lines instead of failing on values[0].
                    continue
                embeddings[values[0]] = np.asarray(values[1:], dtype=np.float32)
        print(f"[INFO] Loaded {len(embeddings)} word vectors from GloVe.")
        return embeddings

    @staticmethod
    def _content(df):
        """Return the 'content' column with missing values replaced by '' (copy, not in place)."""
        return df["content"].fillna("")

    def transform_tfidf(self, df):
        """Transform data using pre-fitted TF-IDF vectorizer."""
        return self.tfidf_vectorizer.transform(self._content(df))

    def transform_bow(self, df, max_features=5000):
        """Transform data using pre-fitted BoW vectorizer; returns a dense array limited to the first max_features columns."""
        bow_matrix = self.bow_vectorizer.transform(self._content(df))
        return bow_matrix.toarray()[:, :max_features]

    def transform_chi2(self, tfidf_features):
        """Transform data using Chi2 selector."""
        return self.chi2_selector.transform(tfidf_features)

    def transform_infogain(self, tfidf_features):
        """Transform data using InfoGain selector."""
        return self.infogain_selector.transform(tfidf_features)

    def transform_glove(self, df, embedding_dim=100):
        """
        Extract GloVe sentence embeddings.

        Each document becomes the mean of the vectors of its known tokens.
        Out-of-vocabulary tokens are skipped rather than averaged in as zero
        vectors, matching ``extract_glove_features`` used for the training
        features. Documents without any known token get the zero vector.

        :param df: DataFrame with a 'content' column
        :param embedding_dim: Dimension of the GloVe embeddings
        :return: float32 array of shape (len(df), embedding_dim)
        """
        lookup = self.glove_embeddings
        glove_features = np.zeros((len(df), embedding_dim), dtype=np.float32)
        for row, content in enumerate(self._content(df)):
            vectors = [lookup[word] for word in content.split() if word in lookup]
            if vectors:
                glove_features[row] = np.mean(vectors, axis=0)
        return glove_features

    def transform_pca_on_glove(self, glove_features):
        """Reduce GloVe embeddings using pre-fitted PCA."""
        return self.pca_model.transform(glove_features)

    def extract_all_features(self, df):
        """
        Main function to transform test data for all models.

        :param df: DataFrame with a 'content' column
        :return: dict with keys "bow", "chi2", "infogain" and "glove_pca"
        """
        # TF-IDF is computed once and shared by both selectors.
        tfidf_features = self.transform_tfidf(df)

        features = {
            "bow": self.transform_bow(df),
            "chi2": self.transform_chi2(tfidf_features),
            "infogain": self.transform_infogain(tfidf_features),
        }

        glove_features = self.transform_glove(df)
        glove_pca_features = self.transform_pca_on_glove(glove_features)
        features["glove_pca"] = glove_pca_features

        return features


In [None]:
# Load the held-out test set once; features and labels both come from it.
test_df = pd.read_csv("dataset_and_corpora/cleaned_test.csv")

feature_extractor = TestFeatureExtractor()

# Reuse the DataFrame that is already in memory instead of re-reading the CSV.
test_labels = test_df["label"].to_numpy()

features = feature_extractor.extract_all_features(test_df)


## Loading Models and Storing Features

In [None]:
# Restore the two final LightGBM models from their saved model files.
glove_pca_lgbm_model = lgb.Booster(model_file="models/lgbm_glove_pca_model.txt")
tfidf_bow_lgbm_model = lgb.Booster(model_file="models/lgbm_tfidf_bow_model.txt")

# Pick the test feature matrix matching each model, plus the ground-truth labels.
X_bow = features["bow"]
X_glove_pca = features["glove_pca"]
y_true = test_df["label"].to_numpy()


## Individual Model Evaluation Function

In [None]:
def evaluate_model(name, y_true, y_pred):
    """
    Evaluate the performance of a model.

    Prints accuracy, F1 score (scikit-learn defaults), the classification
    report and the confusion matrix, writes the same metrics to
    ``evaluations/{name}_metrics.txt``, and saves a row-normalised confusion
    matrix heatmap to ``assets/confusion_matrix_{name}.png``. Both output
    folders are created when missing.

    :param name: Model name, used in headings and output filenames
    :param y_true: Ground-truth labels
    :param y_pred: Predicted labels
    :return: None
    """
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print(f"\n=== {name} Evaluation ===")

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)

    # Save textual metrics
    os.makedirs("evaluations", exist_ok=True)
    metrics_path = f"evaluations/{name}_metrics.txt"
    with open(metrics_path, "w") as f:
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1 Score: {f1:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(str(report))
        f.write("\nConfusion Matrix:\n")
        f.write(np.array2string(matrix))

    # Report the real location, including the folder.
    print(f"[INFO] Saved metrics to {metrics_path}\n")

    # Plot and save confusion matrix with percentages. Rows without any true
    # sample are shown as 0% instead of producing NaN from a division by zero.
    row_totals = matrix.sum(axis=1)[:, np.newaxis]
    cm_percent = (
        np.divide(
            matrix.astype("float"),
            row_totals,
            out=np.zeros(matrix.shape, dtype=float),
            where=row_totals != 0,
        )
        * 100
    )
    annot = np.array([[f"{val:.2f}%" for val in row] for row in cm_percent])

    plt.figure(figsize=(6, 5))
    ax = sns.heatmap(
        cm_percent,
        annot=annot,
        fmt="",
        cmap="Blues",
        cbar=True,
        xticklabels=["Pred Fake", "Pred True"],
        yticklabels=["Actual Fake", "Actual True"],
    )
    ax.xaxis.set_ticks_position("top")
    ax.xaxis.set_label_position("top")
    plt.title(f"{name}", pad=40)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    os.makedirs("assets", exist_ok=True)
    plt.savefig(f"assets/confusion_matrix_{name}.png")
    plt.show()


In [None]:
def _positive_class_probabilities(raw_predictions):
    """Normalise a Booster.predict result to a NumPy array; for an (n, 2) result keep column 1."""
    if issparse(raw_predictions):
        probs = raw_predictions.toarray()  # type: ignore
    elif isinstance(raw_predictions, list) and all(
        issparse(m) for m in raw_predictions
    ):
        probs = np.vstack([m.toarray() for m in raw_predictions])  # type: ignore
    else:
        probs = np.asarray(raw_predictions)

    # Handle 2D probability arrays (binary classification: [prob_0, prob_1])
    if probs.ndim == 2 and probs.shape[1] == 2:
        probs = probs[:, 1]
    return probs


# === GloVe+PCA ===
lgbm_probs = _positive_class_probabilities(glove_pca_lgbm_model.predict(X_glove_pca))

# Threshold to get binary prediction
y_pred_lgbm = (lgbm_probs >= 0.5).astype(int)

# Evaluate
evaluate_model("LightGBM with GloVe+PCA", y_true, y_pred_lgbm)

# === TF-IDF+BoW ===
lgbm_probs = _positive_class_probabilities(tfidf_bow_lgbm_model.predict(X_bow))

# Threshold to get binary prediction
y_pred_lgbm = (lgbm_probs >= 0.5).astype(int)

# Evaluate
evaluate_model("LightGBM with TF-IDF+BoW", y_true, y_pred_lgbm)
