In [None]:
# ================================
# KAGGLE DATASET PIPELINE (LOGISTIC REGRESSION)
# ================================
# Install required packages (if not already installed)
!pip install pandas numpy scikit-learn openpyxl imbalanced-learn

# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# ================================
# HELPER FUNCTION
# ================================
def preprocess_text(text):
    """
    Normalize a raw text value into a lowercase, space-separated string.

    The text is lowercased, every non-word character (anything other than
    letters, digits and underscore) is replaced by a space, runs of
    whitespace are collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: The raw text. Non-string values (for example None or a float
            NaN standing in for a missing value) are treated as empty text
            instead of raising AttributeError.

    Returns:
        The cleaned string, or "" when the input is not a string.
    """
    # Guard against missing values: only str has .lower() and works with re.sub
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Replace non-word characters with spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs and trim
    return text

print("==== KAGGLE DATASET: Logistic Regression ====")

# Load Kaggle data from CSV files.
# Only the two reads live inside the try body so that nothing else can be
# mistaken for a loading failure.
try:
    fake_df = pd.read_csv("fake.csv")
    true_df = pd.read_csv("true.csv")
except Exception as e:
    # Chain explicitly so the original error is recorded as the direct cause
    raise ValueError(f"Error loading Kaggle data: {e}") from e
else:
    print("Kaggle CSV data loaded successfully.")

# Assign labels: fake = 0, true = 1
fake_df['label'] = 0
true_df['label'] = 1

# Combine the two datasets into one frame with a fresh 0..n-1 index
kaggle_df = pd.concat([fake_df, true_df], ignore_index=True)
print(f"Combined Kaggle data size: {len(kaggle_df)}")

# Preprocess text from the "text" column (ensure the column exists)
if "text" not in kaggle_df.columns:
    raise ValueError("The Kaggle dataset must have a 'text' column.")
# Missing cells come back from read_csv as NaN (a float), which has no
# .lower(); turn them into empty strings and coerce everything to str before
# cleaning so one blank row cannot abort the whole pipeline.
kaggle_df["clean_text"] = (
    kaggle_df["text"].fillna("").astype(str).apply(preprocess_text)
)

# Optional: sample a subset if needed
# kaggle_df = kaggle_df.sample(n=35000, random_state=42)

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# fake/true ratio the same in both partitions.
kaggle_texts = kaggle_df["clean_text"]
kaggle_labels = kaggle_df["label"]
kaggle_split = train_test_split(
    kaggle_texts,
    kaggle_labels,
    stratify=kaggle_labels,
    random_state=42,
    test_size=0.2,
)
X_train_kaggle, X_test_kaggle, y_train_kaggle, y_test_kaggle = kaggle_split
print(f"Kaggle Train size: {len(X_train_kaggle)}, Test size: {len(X_test_kaggle)}")

# Turn the cleaned Kaggle text into TF-IDF features.
# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
print("Applying TF-IDF vectorization for Kaggle...")
tfidf_settings = {
    "max_features": 10000,
    "stop_words": "english",
    "ngram_range": (1, 3),
}
vectorizer_kaggle = TfidfVectorizer(**tfidf_settings)
X_train_kaggle_tfidf = vectorizer_kaggle.fit_transform(X_train_kaggle)
X_test_kaggle_tfidf = vectorizer_kaggle.transform(X_test_kaggle)
print("TF-IDF vectorization complete for Kaggle.")

# Oversample the Kaggle training features with SMOTE (training split only;
# the test split is left untouched).
print("Applying SMOTE to balance the Kaggle training set...")
smote = SMOTE(random_state=42)
resampled_kaggle = smote.fit_resample(X_train_kaggle_tfidf, y_train_kaggle)
X_train_kaggle_res, y_train_kaggle_res = resampled_kaggle
print("SMOTE applied. New class distribution:")
resampled_counts = pd.Series(y_train_kaggle_res).value_counts()
print(resampled_counts)

# Fit the Kaggle logistic-regression classifier on the resampled training data
print("Training Logistic Regression model for Kaggle...")
lr_settings = {"max_iter": 200, "n_jobs": -1, "random_state": 42}
model_kaggle = LogisticRegression(**lr_settings)
model_kaggle.fit(X_train_kaggle_res, y_train_kaggle_res)
print("Model training complete for Kaggle.")

# Score the held-out Kaggle test split with the trained model
print("Making predictions on the Kaggle test set...")
y_pred_kaggle = model_kaggle.predict(X_test_kaggle_tfidf)

# Report accuracy, the per-class metrics table and the confusion matrix
accuracy_kaggle = accuracy_score(y_test_kaggle, y_pred_kaggle)
print(f"\nKaggle Model Accuracy: {accuracy_kaggle:.4f}")

print("\nKaggle Classification Report:")
kaggle_report_text = classification_report(y_test_kaggle, y_pred_kaggle)
print(kaggle_report_text)

print("Kaggle Confusion Matrix:")
kaggle_confusion = confusion_matrix(y_test_kaggle, y_pred_kaggle)
print(kaggle_confusion)

# Write the Kaggle evaluation results to Excel.
# The report is requested as a dict so it can be laid out as a table with one
# row per report entry; the row labels are kept via index=True.
kaggle_report_dict = classification_report(
    y_test_kaggle, y_pred_kaggle, output_dict=True
)
kaggle_report_df = pd.DataFrame(kaggle_report_dict).T
kaggle_report_df.to_excel("LR_Kaggle_Classification_Report.xlsx", index=True)

# Row-level output: cleaned test text next to the predicted and true labels
X_test_kaggle_df = X_test_kaggle.to_frame(name="text").assign(
    predicted_label=y_pred_kaggle,
    true_label=y_test_kaggle.values,
)
X_test_kaggle_df.to_excel("LR_Kaggle_Predictions.xlsx", index=False)
print("Kaggle evaluation and predictions saved successfully.\n")
