## Imports

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    confusion_matrix,
)

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

from IPython.display import display
import ipywidgets as widgets

import os

import pickle


---

## 🔢 Feature Extraction

<!-- TODO: Explain -->

| Model               | Best Feature Extraction Method |
| :------------------ | :----------------------------- |
| Random Forest       | TF-IDF + Chi-Square            |
| Logistic Regression | TF-IDF + Information Gain      |
| Gradient Boosting   | GloVe + PCA                    |
<!-- | SVM                 | TF-IDF + BoW                   |
| DistilBERT          | Fine-tuned BERT embeddings     | -->


In [None]:
# Load the training split once; the "label" column of this frame is read
# further down to build the label array/tensor used for training.
final_train_df = pd.read_csv("dataset_and_corpora/augmented_train_senti.csv")


In [None]:
def save_vectorizer(model, filename, folder="vectorizers"):
    """
    Pickle an object to ``<folder>/<filename>``.

    :param model: Any picklable object (vectorizer, selector, PCA model, ...)
    :param filename: Name of the file to create inside ``folder``
    :param folder: Destination directory; created first when it does not exist
    """
    os.makedirs(folder, exist_ok=True)
    target_path = os.path.join(folder, filename)

    with open(target_path, "wb") as handle:
        pickle.dump(model, handle)

    print(f"[INFO] Saved model to {target_path}")


### TF-IDF Feature Extraction

TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used in natural language processing and information retrieval to evaluate the importance of a word in a document relative to a collection of documents (corpus).


In [None]:
def extract_tfidf_features(df: pd.DataFrame, max_features: int = 5000) -> csr_matrix:
    """
    Fit a TF-IDF vectorizer on ``df["content"]`` and return the resulting matrix.

    Side effects: missing values in ``df["content"]`` are replaced by empty
    strings on the caller's DataFrame, a progress widget is displayed, and the
    fitted vectorizer is saved as ``final_tfidf_vectorizer.pkl``.

    :param df: Input DataFrame with a 'content' column
    :param max_features: Upper bound on the vocabulary size kept by the vectorizer
    :return: TF-IDF features as a sparse matrix
    """
    bar = widgets.IntProgress(value=0, min=0, max=1, description="TF-IDF:")
    display(bar)

    # Written back to the caller's frame so missing documents become "".
    df["content"] = df["content"].fillna("")

    tfidf = TfidfVectorizer(max_features=max_features)
    document_term = csr_matrix(tfidf.fit_transform(df["content"]))

    # Persist the fitted vocabulary/IDF weights for reuse on unseen data.
    save_vectorizer(tfidf, "final_tfidf_vectorizer.pkl")

    bar.value = 1
    return document_term


#### Chi-Square Feature Extraction

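The Chi-Square test scores each TF-IDF feature by its statistical dependence on the class label; only the `k` highest-scoring features are kept.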

In [None]:
def extract_chi2_features(
    tfidf_features: csr_matrix, labels: np.ndarray, k: int = 5000
) -> csr_matrix:
    """
    Keep the k features with the highest Chi-Square score against the labels.

    A progress widget is displayed while the selector is fitted, and the fitted
    selector is saved as ``final_chi2_selector.pkl``.

    :param tfidf_features: TF-IDF features as a sparse matrix
    :param labels: Class label for every row of ``tfidf_features``
    :param k: Number of top-scoring features to keep
    :return: Reduced feature set as a sparse matrix
    """
    bar = widgets.IntProgress(value=0, min=0, max=1, description="Chi2:")
    display(bar)

    selector = SelectKBest(chi2, k=k)
    selected = selector.fit_transform(tfidf_features, labels)
    reduced = csr_matrix(selected)

    # Persist the fitted selector so the same columns can be picked later.
    save_vectorizer(selector, "final_chi2_selector.pkl")

    bar.value = 1
    return reduced


#### Bag of Words Feature Extraction

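Bag of Words (BoW) represents each document by its raw term counts. Here the counts are computed over the vocabulary of the already-fitted TF-IDF vectorizer, so both representations share the same columns.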

In [None]:
def extract_bow_features(tfidf_vectorizer: TfidfVectorizer, df: pd.DataFrame):
    """
    Count term occurrences in ``df["content"]`` over the TF-IDF vocabulary.

    Side effects: missing values in ``df["content"]`` are replaced by empty
    strings on the caller's DataFrame, a progress widget is displayed, and the
    count vectorizer is saved as ``final_bow_vectorizer.pkl``.

    :param tfidf_vectorizer: Fitted TF-IDF vectorizer whose ``vocabulary_`` is reused
    :param df: Input DataFrame with a 'content' column
    :return: Term-count matrix produced by ``CountVectorizer.fit_transform``
    """
    bar = widgets.IntProgress(value=0, min=0, max=1, description="BoW:")
    display(bar)

    # Written back to the caller's frame so missing documents become "".
    df["content"] = df["content"].fillna("")

    # Fixing the vocabulary keeps the BoW columns aligned with the TF-IDF ones.
    counter = CountVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)
    term_counts = counter.fit_transform(df["content"])

    save_vectorizer(counter, "final_bow_vectorizer.pkl")

    bar.value = 1
    return term_counts


#### Information Gain Feature Extraction

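Information Gain is estimated here with mutual information between each TF-IDF feature and the class label; only the `k` highest-scoring features are kept.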

In [None]:
def extract_information_gain_features(
    tfidf_features: csr_matrix, labels: np.ndarray, k: int = 5000
) -> csr_matrix:
    """
    Select the top k features by Information Gain (mutual information).

    Only the reduced feature matrix is returned. The fitted selector is not
    returned; it is persisted as ``final_infogain_selector.pkl`` so it can be
    reloaded later.

    :param tfidf_features: TF-IDF features as a sparse matrix
    :param labels: Class label for every row of ``tfidf_features``; cast to int
        before scoring
    :param k: Number of top-scoring features to keep
    :return: Output of ``SelectKBest.fit_transform`` (sparse when
        ``tfidf_features`` is sparse)
    """
    progress = widgets.IntProgress(value=0, min=0, max=1, description="InfoGain:")
    display(progress)

    # mutual_info_classif needs discrete class labels.
    labels = labels.astype(int)

    infogain_selector = SelectKBest(score_func=mutual_info_classif, k=k)
    reduced_features = infogain_selector.fit_transform(tfidf_features, labels)

    save_vectorizer(infogain_selector, "final_infogain_selector.pkl")

    progress.value = 1
    return reduced_features


### GloVe Feature Extraction

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

**Introduced in** Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/pubs/glove.pdf).

Before running the cell, please download the pre-trained word vectors (Wikipedia 2014 + Gigaword 5) from [here](https://nlp.stanford.edu/data/glove.6B.zip) and move `glove.6B.100d.txt` into the `dataset_and_corpora` folder.

In [None]:
def extract_glove_features(
    df: pd.DataFrame,
    glove_path: str = "dataset_and_corpora/glove.6B.100d.txt",
    embedding_dim: int = 100,
) -> np.ndarray:
    """
    Extract GloVe features from the 'content' column of the DataFrame.

    Each document is whitespace-tokenised and represented by the mean of the
    GloVe vectors of its in-vocabulary tokens. Out-of-vocabulary tokens are
    ignored; documents with no known token (including empty or missing content)
    get an all-zero vector. The input DataFrame is not modified.

    :param df: Input DataFrame with a 'content' column
    :param glove_path: Path to the GloVe embeddings file
    :param embedding_dim: Dimension of the GloVe embeddings; must match the file
    :return: float32 array of shape ``(len(df), embedding_dim)``
    """
    progress = widgets.IntProgress(value=0, min=0, max=len(df), description="GloVe:")
    display(progress)

    # Load GloVe embeddings: one "<word> <v1> <v2> ..." entry per line.
    glove_embeddings = {}
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            glove_embeddings[values[0]] = np.asarray(values[1:], dtype="float32")

    # Missing content is treated as an empty document instead of crashing on
    # ``float.split``; fillna returns a new Series, so ``df`` stays untouched.
    contents = df["content"].fillna("")

    # Pre-allocating keeps the (n_docs, embedding_dim) shape even for an empty
    # frame and provides the zero-vector fallback for free.
    glove_features = np.zeros((len(contents), embedding_dim), dtype="float32")

    # Refresh the widget roughly 100 times in total; a refresh per row would
    # flood the notebook front end on large frames.
    refresh_every = max(1, len(contents) // 100)

    for row, content in enumerate(contents):
        vectors = [
            glove_embeddings[word]
            for word in content.split()
            if word in glove_embeddings
        ]
        if vectors:
            glove_features[row] = np.mean(vectors, axis=0)
        if (row + 1) % refresh_every == 0:
            progress.value = row + 1

    progress.value = len(contents)
    return glove_features


#### PCA Feature Extraction

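Principal Component Analysis (PCA) projects the features onto the `n_components` orthogonal directions of greatest variance.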

In [None]:
def perform_pca(features: np.ndarray, n_components: int = 100) -> np.ndarray:
    """
    Fit PCA on ``features`` and return their projection onto the components.

    The fitted PCA model is saved as ``final_pca_model.pkl`` so the same
    projection can be applied to other data later.

    :param features: Input features as a NumPy array
    :param n_components: Number of principal components to retain
    :return: Projected features as a NumPy array
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(features)

    save_vectorizer(reducer, "final_pca_model.pkl")

    return projected


---

## 📉 Logistic Regression

Logistic Regression is a fundamental linear machine learning algorithm widely used for binary and multiclass classification tasks. It models the probability that a given input belongs to a particular class by applying the logistic (sigmoid) function to a linear combination of input features. Logistic Regression is especially valued for its simplicity, interpretability, and efficiency, making it a strong baseline for text classification problems such as fake news detection. In this implementation, a custom logistic regression model is built using PyTorch’s neural network modules, allowing for GPU acceleration and flexible training routines.

The following code performs:
1. _[Feature Loading](#loading-tf-idf--information-gain-features-and-converting-to-pytorch-tensors):_ Loads the preprocessed features ([TF-IDF vectors selected by Information Gain feature selection](#information-gain-feature-extraction)), converting them into PyTorch tensors for model training.
2. _[K-Fold Cross-Validation](#test-lr-with-k-fold-cross-validation):_ Uses 5-fold cross-validation to rigorously evaluate the model’s performance, ensuring that results are robust and generalizable.
3. _Ensemble Predictions:_ Stores the probabilistic predictions from all folds (in `predictions/lr_w_<name>_ensemble_predictions.pkl`, one file per feature set), enabling ensemble analysis and comprehensive performance assessment.
4. _[Final Model](#train-lr-on-training-set):_ Trains a final Logistic Regression model on the entire dataset with early stopping based on training loss, and saves the best-performing model (in `models/lr_<name>_model.pth`, one file per feature set) for future inference or deployment. The training loss per epoch is also saved and visualized for further analysis.




### Loading TF-IDF + Information Gain Features and Converting to PyTorch tensors

In [None]:
# Training labels as a NumPy array, plus the same values as an int64 tensor
# (class indices for nn.CrossEntropyLoss). Both are read as globals by the
# training functions below.
labels = final_train_df["label"].to_numpy()
t_labels = torch.tensor(labels, dtype=torch.long)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


### Custom LR Model

In [None]:
class LR_Model(nn.Module):
    """Logistic regression expressed as one linear layer with two output logits."""

    def __init__(self, input_dim):
        """
        :param input_dim: Number of input features per sample
        """
        super().__init__()
        # Two outputs: one raw score (logit) per class. No softmax is applied
        # here, so callers receive unnormalised scores.
        self.fc = nn.Linear(in_features=input_dim, out_features=2)

    def forward(self, x):
        """Return the raw class scores for a batch ``x``."""
        logits = self.fc(x)
        return logits


### Test LR with K-Fold Cross-Validation

In [None]:
def lr_KCorss_Val(name, features, t_features):
    """
    Evaluate the logistic-regression model with 5-fold cross-validation.

    For every fold a fresh ``LR_Model`` is trained for 10 epochs (Adam,
    lr=0.001, batch size 32) on the training split and evaluated on the
    held-out split. Per-fold accuracy and a classification report are printed,
    each fold's weights are saved under ``models/lr_epochs/``, and the
    out-of-fold class probabilities of all samples are pickled to
    ``predictions/lr_w_<name>_ensemble_predictions.pkl``.

    Relies on the module-level ``labels``, ``t_labels`` and ``device``.

    :param name: Tag for this feature set; becomes part of the output file names
    :param features: Feature matrix; only used to generate the fold indices
    :param t_features: The same features as a tensor, indexed per fold
    """
    k = 5
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Create the output folders before training so the run cannot fail at
    # save time after the folds have already been trained.
    predictions_path = f"predictions/lr_w_{name}_ensemble_predictions.pkl"
    os.makedirs("models/lr_epochs", exist_ok=True)
    os.makedirs(os.path.dirname(predictions_path), exist_ok=True)

    fold_accuracies = []
    # Every sample lands in exactly one test fold, so each row is filled once.
    ensemble_predictions = torch.zeros(
        (len(labels), 2), dtype=torch.float32, device=device
    )
    print(f"\n==== k cross fold eval on {name} ====\n")
    for fold, (train_idx, test_idx) in enumerate(kf.split(features)):
        print(f"\nFold {fold + 1}/{k}")

        X_train, X_test = (
            t_features[train_idx],
            t_features[test_idx],
        )
        y_train, y_test = t_labels[train_idx], t_labels[test_idx]

        train_dataset = TensorDataset(X_train, y_train)
        test_dataset = TensorDataset(X_test, y_test)

        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        # No shuffling: the concatenated outputs must stay aligned with test_idx.
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

        lr_model = LR_Model(input_dim=X_train.shape[1]).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(lr_model.parameters(), lr=0.001)

        num_epochs = 10

        for epoch in range(num_epochs):
            lr_model.train()
            running_loss = 0.0
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                optimizer.zero_grad()
                outputs = lr_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            # The value printed here is the sum of the batch losses of the epoch.
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss:.4f}")

        # Save model for this fold
        torch.save(
            lr_model.state_dict(),
            f"models/lr_epochs/{name}_lr_fold{fold + 1}.pth",
        )

        lr_model.eval()
        all_outputs = []
        all_targets = []
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = lr_model(batch_X)
                all_outputs.append(outputs)
                all_targets.append(batch_y)

        outputs = torch.cat(all_outputs)
        y_test = torch.cat(all_targets)

        # Hard predictions come from the arg-max logit; the probabilities kept
        # for ensembling come from a softmax over the same logits.
        _, y_pred = torch.max(outputs, 1)
        y_pred_proba = torch.softmax(outputs, dim=1)

        ensemble_predictions[test_idx] = y_pred_proba

        accuracy = accuracy_score(y_test.cpu(), y_pred.cpu())
        fold_accuracies.append(accuracy)

        print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
        print(classification_report(y_test.cpu(), y_pred.cpu()))

    # Save ensemble predictions
    with open(predictions_path, "wb") as file:
        pickle.dump(ensemble_predictions.cpu().numpy(), file)

    # Print average accuracy
    average_accuracy = sum(fold_accuracies) / k
    print(f"\nAverage Accuracy across {k} folds: {average_accuracy:.4f}")


### Train LR on Training Set

In [None]:
# NOTE: this banner is printed when the cell is executed (i.e. when
# `full_train` is defined), not each time `full_train` is called.
print("\nTraining final model on full dataset with early stopping...")


def full_train(name, features, t_features):
    """
    Train the final logistic-regression model on the whole training set.

    Trains for at most 100 epochs (Adam, lr=0.001, batch size 32) and stops
    early once the mean training loss has not improved by more than 1e-4 for
    5 consecutive epochs. Whenever the loss improves, the whole model object
    is saved to ``models/lr_<name>_model.pth``. The per-epoch losses are
    pickled to ``assets/lr_w_<name>_training_loss_per_epoch.pkl`` and plotted
    to ``evaluations/lr_w_<name>_training_loss.png``.

    Relies on the module-level ``t_labels`` and ``device``.

    :param name: Tag for this feature set; becomes part of the output file names
    :param features: Feature matrix; only its second dimension (feature count) is read
    :param t_features: The same features as a tensor, paired with ``t_labels``
    """
    model_path = f"models/lr_{name}_model.pth"
    losses_path = f"assets/lr_w_{name}_training_loss_per_epoch.pkl"
    plot_path = f"evaluations/lr_w_{name}_training_loss.png"

    # Create the output folders before training so the run cannot fail at
    # save time after epochs have already been spent.
    for output_path in (model_path, losses_path, plot_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    full_train_dataset = TensorDataset(t_features, t_labels)
    full_train_loader = DataLoader(full_train_dataset, batch_size=32, shuffle=True)

    final_lr_model = LR_Model(input_dim=features.shape[1]).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(final_lr_model.parameters(), lr=0.001)

    num_epochs = 100
    patience = 5
    best_loss = float("inf")
    epochs_without_improvement = 0

    train_losses = []

    print(f"\n==== Training final model on {name} ====\n")
    for epoch in range(num_epochs):
        final_lr_model.train()
        running_loss = 0.0
        for batch_X, batch_y in full_train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = final_lr_model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Mean batch loss of this epoch.
        avg_loss = running_loss / len(full_train_loader)
        train_losses.append(avg_loss)

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

        # Early stopping check: only a drop larger than 1e-4 counts as progress.
        if avg_loss < best_loss - 1e-4:
            best_loss = avg_loss
            epochs_without_improvement = 0
            # The whole module is pickled (not just its state_dict), so the
            # file must later be loaded with ``weights_only=False``.
            torch.save(
                final_lr_model,
                model_path,
            )
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print(
                f"Early stopping triggered after {patience} epochs without improvement."
            )
            break

    # Save the loss per epoch data
    with open(losses_path, "wb") as file:
        pickle.dump(train_losses, file)

    print("\nTraining complete. Best model saved.")

    # Now plot the losses
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(train_losses) + 1), train_losses, marker="o")
    plt.title(f"Training Loss per Epoch for {name}")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.ylim(0, max(train_losses) * 1.1)
    plt.grid(True)
    plt.savefig(plot_path)
    plt.show()


In [None]:
# Load the pre-computed TF-IDF + BoW features and cast them to float32.
# Presumably a SciPy sparse matrix, since `.todense()` is called on it below.
with open("features/tfidf_bow_features.pkl", "rb") as file:
    tfidf_bow_features = pickle.load(file).astype(np.float32)

# Dense tensor copy: the training functions index and batch this tensor.
t_tfidf_bow_features = torch.tensor(tfidf_bow_features.todense(), dtype=torch.float32)

# The first argument becomes part of the output file names.
lr_KCorss_Val(
    "tfidf_bow_features",
    tfidf_bow_features,
    t_tfidf_bow_features,
)

full_train("tfidf_bow", tfidf_bow_features, t_tfidf_bow_features)
# Drop the references so the memory can be reclaimed before the next cell.
del tfidf_bow_features, t_tfidf_bow_features


In [None]:
# Load the pre-computed TF-IDF + Chi-Square features and cast them to float32.
# Presumably a SciPy sparse matrix, since `.todense()` is called on it below.
with open("features/tfidf_chi2_features.pkl", "rb") as file:
    tfidf_chi2_features = pickle.load(file).astype(np.float32)

# Dense tensor copy: the training functions index and batch this tensor.
t_tfidf_chi2_features = torch.tensor(tfidf_chi2_features.todense(), dtype=torch.float32)

# The first argument becomes part of the output file names.
lr_KCorss_Val(
    "tfidf_chi2_features",
    tfidf_chi2_features,
    t_tfidf_chi2_features,
)

full_train("tfidf_chi2", tfidf_chi2_features, t_tfidf_chi2_features)

# Drop the references so the memory can be reclaimed before the next cell.
del tfidf_chi2_features, t_tfidf_chi2_features


In [None]:
# Load the pre-computed TF-IDF + Information Gain features and cast them to
# float32. Presumably a SciPy sparse matrix, since `.todense()` is called below.
with open("features/tfidf_infogain_features.pkl", "rb") as file:
    tfidf_infogain_features = pickle.load(file).astype(np.float32)

# Dense tensor copy: the training functions index and batch this tensor.
t_tfidf_infogain_features = torch.tensor(
    tfidf_infogain_features.todense(), dtype=torch.float32
)

# The first argument becomes part of the output file names.
lr_KCorss_Val(
    "tfidf_infogain_features",
    tfidf_infogain_features,
    t_tfidf_infogain_features,
)

full_train("tfidf_infogain", tfidf_infogain_features, t_tfidf_infogain_features)

# Drop the references so the memory can be reclaimed before the next cell.
del tfidf_infogain_features, t_tfidf_infogain_features


In [None]:
# Load the pre-computed GloVe + PCA features. Unlike the TF-IDF based cells,
# the object is passed to torch.tensor directly (no `.todense()`), so it is
# presumably already a dense array.
with open("features/glove_pca_features.pkl", "rb") as file:
    glove_pca_features = pickle.load(file)

t_glove_pca_features = torch.tensor(glove_pca_features, dtype=torch.float32)

# The first argument becomes part of the output file names.
lr_KCorss_Val(
    "glove_pca_features",
    glove_pca_features,
    t_glove_pca_features,
)

full_train("glove_pca", glove_pca_features, t_glove_pca_features)

# Drop the references so the memory can be reclaimed before the next cell.
del glove_pca_features, t_glove_pca_features


# 4. 🍎 Testing Testing



## Test Set Feature Extraction

In [None]:
class TestFeatureExtractor:
    """
    Apply the feature pipelines fitted on the training set to unseen data.

    All vectorizers, selectors and the PCA model are loaded from the
    ``vectorizers/`` folder; nothing is re-fitted here. Every ``transform_*``
    method treats missing ``content`` values as empty documents and leaves the
    input DataFrame unmodified.
    """

    def __init__(self):
        """Load all pre-fitted vectorizers, selectors, and models needed for test feature extraction."""
        print("[INFO] Loading feature extractors...")

        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # load artefacts that were produced by this project.

        # Load TF-IDF Vectorizer
        with open("vectorizers/final_tfidf_vectorizer.pkl", "rb") as f:
            self.tfidf_vectorizer = pickle.load(f)

        # Load Chi2 Selector
        with open("vectorizers/final_chi2_selector.pkl", "rb") as f:
            self.chi2_selector = pickle.load(f)

        # Load BoW Vectorizer
        with open("vectorizers/final_bow_vectorizer.pkl", "rb") as f:
            self.bow_vectorizer = pickle.load(f)

        # Load InfoGain Selector
        with open("vectorizers/final_infogain_selector.pkl", "rb") as f:
            self.infogain_selector = pickle.load(f)

        # Load GloVe Embeddings from the raw text file (they are not pickled)
        self.glove_embeddings = self.load_glove_embeddings(
            "dataset_and_corpora/glove.6B.100d.txt"
        )

        # Load PCA model for GloVe
        with open("vectorizers/final_pca_model.pkl", "rb") as f:
            self.pca_model = pickle.load(f)

    def load_glove_embeddings(self, glove_file_path):
        """
        Load GloVe embeddings from a text file into a dictionary.

        :param glove_file_path: Path to a file with one "<word> <v1> <v2> ..." entry per line
        :return: Dict mapping each word to its float32 vector
        """
        print("[INFO] Loading GloVe embeddings...")
        embeddings = {}
        with open(glove_file_path, "r", encoding="utf-8") as f:
            for line in f:
                values = line.strip().split()
                if not values:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                embeddings[values[0]] = np.asarray(values[1:], dtype=np.float32)
        print(f"[INFO] Loaded {len(embeddings)} word vectors from GloVe.")
        return embeddings

    def transform_tfidf(self, df):
        """Transform ``df["content"]`` using the pre-fitted TF-IDF vectorizer."""
        return self.tfidf_vectorizer.transform(df["content"].fillna(""))

    def transform_bow(self, df, max_features=5000):
        """
        Transform ``df["content"]`` using the pre-fitted BoW vectorizer.

        :param df: DataFrame with a 'content' column
        :param max_features: Keep only the first ``max_features`` vocabulary columns
        :return: Dense count array
        """
        bow_matrix = self.bow_vectorizer.transform(df["content"].fillna(""))
        return bow_matrix.toarray()[:, :max_features]

    def transform_chi2(self, tfidf_features):
        """Keep the TF-IDF columns chosen by the pre-fitted Chi2 selector."""
        return self.chi2_selector.transform(tfidf_features)

    def transform_infogain(self, tfidf_features):
        """Keep the TF-IDF columns chosen by the pre-fitted InfoGain selector."""
        return self.infogain_selector.transform(tfidf_features)

    def transform_glove(self, df, embedding_dim=100):
        """
        Extract GloVe sentence embeddings.

        Each document is the mean of the vectors of its in-vocabulary tokens.
        Out-of-vocabulary tokens are skipped rather than averaged in as zero
        vectors, which would shrink the embedding and make it differ from how
        the training features were computed. Documents without any known token
        get an all-zero vector.

        :param df: DataFrame with a 'content' column
        :param embedding_dim: Dimension of the loaded GloVe vectors
        :return: float32 array of shape ``(len(df), embedding_dim)``
        """
        contents = df["content"].fillna("")
        # Pre-allocating keeps the 2-D shape for an empty frame and provides
        # the zero-vector fallback for free.
        glove_features = np.zeros((len(contents), embedding_dim), dtype=np.float32)
        for row, content in enumerate(contents):
            vectors = [
                self.glove_embeddings[word]
                for word in content.split()
                if word in self.glove_embeddings
            ]
            if vectors:
                glove_features[row] = np.mean(vectors, axis=0)
        return glove_features

    def transform_pca_on_glove(self, glove_features):
        """Reduce GloVe embeddings using pre-fitted PCA."""
        return self.pca_model.transform(glove_features)

    def extract_all_features(self, df):
        """
        Main function to transform test data for all models.

        :param df: DataFrame with a 'content' column
        :return: Dict with the keys "bow", "chi2", "infogain" and "glove_pca"
        """
        # TF-IDF is computed once and shared by both feature selectors.
        tfidf_features = self.transform_tfidf(df)

        features = {
            "bow": self.transform_bow(df),
            "chi2": self.transform_chi2(tfidf_features),
            "infogain": self.transform_infogain(tfidf_features),
        }

        glove_features = self.transform_glove(df)
        glove_pca_features = self.transform_pca_on_glove(glove_features)
        features["glove_pca"] = glove_pca_features

        return features


In [None]:
# Read the test split once; labels and features both come from this frame.
test_df = pd.read_csv("dataset_and_corpora/cleaned_test.csv")

feature_extractor = TestFeatureExtractor()

# Take the labels from the frame already in memory instead of parsing the
# same CSV a second time.
test_labels = test_df["label"].to_numpy()

# Dict of feature matrices keyed by "bow", "chi2", "infogain" and "glove_pca".
features = feature_extractor.extract_all_features(test_df)


## Loading Models and Storing Features

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# NOTE(review): `weights_only=False` unpickles a whole model object (that is
# how `full_train` saves it), which can execute arbitrary code contained in
# the file. Only load checkpoints produced by this notebook.
# Each model is switched to evaluation mode right after loading.
tfidf_infogain_lr_model = torch.load(
    "models/lr_tfidf_infogain_model.pth", map_location=device, weights_only=False
)
tfidf_infogain_lr_model.eval()

tfidf_bow_lr_model = torch.load(
    "models/lr_tfidf_bow_model.pth", map_location=device, weights_only=False
)
tfidf_bow_lr_model.eval()

tfidf_chi2_lr_model = torch.load(
    "models/lr_tfidf_chi2_model.pth", map_location=device, weights_only=False
)
tfidf_chi2_lr_model.eval()

glove_pca_lr_model = torch.load(
    "models/lr_glove_pca_model.pth", map_location=device, weights_only=False
)
glove_pca_lr_model.eval()


## Individual Model Evaluation Function

In [None]:
def evaluate_model(name, y_true, y_pred):
    """
    Evaluate the performance of a model.

    Prints accuracy, F1 score, the classification report and the confusion
    matrix, writes the same metrics to ``evaluations/<name>_metrics.txt``, and
    saves a row-normalised confusion-matrix heatmap to
    ``assets/confusion_matrix_<name>.png``. Both output folders are created
    when missing.

    :param name: Display name of the model; also used in the output file names
    :param y_true: Ground-truth labels
    :param y_pred: Predicted labels, aligned with ``y_true``
    """
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print(f"\n=== {name} Evaluation ===")

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)

    metrics_path = f"evaluations/{name}_metrics.txt"
    figure_path = f"assets/confusion_matrix_{name}.png"
    # Make sure the output folders exist so the function does not fail at
    # write time on a fresh checkout.
    for output_path in (metrics_path, figure_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save textual metrics
    with open(metrics_path, "w") as f:
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write(f"F1 Score: {f1:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(str(report))
        f.write("\nConfusion Matrix:\n")
        f.write(np.array2string(matrix))

    print(f"[INFO] Saved metrics to {name}_metrics.txt\n")

    # Plot and save confusion matrix with percentages. Each row is normalised
    # by the number of true samples of that class; a class without any true
    # sample would divide by zero, so such rows are left at 0% instead of NaN.
    row_totals = matrix.sum(axis=1)[:, np.newaxis]
    cm_percent = (
        np.divide(
            matrix.astype("float"),
            row_totals,
            out=np.zeros(matrix.shape, dtype="float"),
            where=row_totals != 0,
        )
        * 100
    )
    annot = np.array([[f"{val:.2f}%" for val in row] for row in cm_percent])

    plt.figure(figsize=(6, 5))
    ax = sns.heatmap(
        cm_percent,
        annot=annot,
        fmt="",
        cmap="Blues",
        cbar=True,
        xticklabels=["Pred Fake", "Pred True"],
        yticklabels=["Actual Fake", "Actual True"],
    )
    ax.xaxis.set_ticks_position("top")
    ax.xaxis.set_label_position("top")
    plt.title(f"{name}", pad=40)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.savefig(figure_path)
    plt.show()


In [None]:
def _positive_class_probability(model, inputs):
    """
    Return the predicted probability of class 1 for every row of ``inputs``.

    The LR models output raw logits, so a softmax over the two class scores is
    required before the 0.5 threshold below is meaningful. Thresholding the
    raw class-1 logit at 0.5 is not equivalent to picking the likelier class.
    """
    with torch.no_grad():
        logits = model(inputs)
        return torch.softmax(logits, dim=1)[:, 1].cpu().numpy()


y_true = test_labels
# === TF-IDF+Information Gain ===
X_infogain = features["infogain"]
X_infogain_tensor = torch.tensor(X_infogain.toarray(), dtype=torch.float32).to(device)
lr_probs = _positive_class_probability(tfidf_infogain_lr_model, X_infogain_tensor)
y_pred_lr = (lr_probs >= 0.5).astype(int)
evaluate_model("Logistic Regression with TF-IDF+Information Gain", y_true, y_pred_lr)
del X_infogain, X_infogain_tensor, y_pred_lr, lr_probs

# === TF-IDF+Chi2 ===
X_chi2 = features["chi2"]
X_chi2_tensor = torch.tensor(X_chi2.toarray(), dtype=torch.float32).to(device)
lr_probs = _positive_class_probability(tfidf_chi2_lr_model, X_chi2_tensor)
y_pred_lr = (lr_probs >= 0.5).astype(int)
evaluate_model("Logistic Regression with TF-IDF+Chi2", y_true, y_pred_lr)
del X_chi2, X_chi2_tensor, y_pred_lr, lr_probs

# === TF-IDF+BoW ===
# The BoW features are already a dense array (see TestFeatureExtractor.transform_bow).
X_bow = features["bow"]
X_bow_tensor = torch.tensor(X_bow, dtype=torch.float32).to(device)
lr_probs = _positive_class_probability(tfidf_bow_lr_model, X_bow_tensor)
y_pred_lr = (lr_probs >= 0.5).astype(int)
evaluate_model("Logistic Regression with TF-IDF+BoW", y_true, y_pred_lr)
del X_bow, X_bow_tensor, y_pred_lr, lr_probs

# === GloVe+PCA ===
X_glove_pca = features["glove_pca"]
X_glove_pca_tensor = torch.tensor(X_glove_pca, dtype=torch.float32).to(device)
lr_probs = _positive_class_probability(glove_pca_lr_model, X_glove_pca_tensor)
y_pred_lr = (lr_probs >= 0.5).astype(int)
evaluate_model("Logistic Regression with GloVe+PCA", y_true, y_pred_lr)
del X_glove_pca, X_glove_pca_tensor, y_pred_lr, lr_probs
