In [None]:
# ================================
# FEVER DATASET PIPELINE (LOGISTIC REGRESSION)
# ================================
# Install required packages (if not already installed)
!pip install pandas numpy scikit-learn openpyxl imbalanced-learn

# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# ================================
# HELPER FUNCTION
# ================================
def preprocess_text(text):
    """
    Normalize a piece of text before vectorization.

    The text is lowercased, every non-word character (anything that is not a
    letter, digit or underscore) is replaced by a space, runs of whitespace
    are collapsed to a single space and leading/trailing spaces are removed.

    Missing values (None or a float NaN) produce an empty string, and any
    other non-string value is converted with str() first, so the function can
    be applied to a whole DataFrame column without raising AttributeError.

    Args:
        text: The value to clean; normally a str.

    Returns:
        The cleaned string, or "" when the input is missing.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing an extra import just for the missing-value test.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Replace non-word characters with spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse repeated whitespace
    return text

print("==== FEVER DATASET: Logistic Regression ====")

# Load FEVER data (Excel file).
# Only the read itself sits inside the try block; any failure is re-raised as
# ValueError with the original exception chained as its explicit cause, so the
# underlying traceback (missing file, missing Excel engine, ...) stays visible.
try:
    fever_df = pd.read_excel("../data/filtered_fever_data.xlsx")
except Exception as e:
    raise ValueError(f"Error loading FEVER data: {e}") from e
else:
    print("FEVER data loaded successfully.")

# Validate the columns this pipeline depends on before touching the data.
if "label" not in fever_df.columns:
    raise ValueError("The FEVER dataset must have a 'label' column.")
if "claim" not in fever_df.columns:
    raise ValueError("The FEVER dataset must have a 'claim' column.")

# Mapping labels (all 3 classes are kept; "NOT ENOUGH INFO" becomes class 2)
label_mapping = {"SUPPORTS": 1, "REFUTES": 0, "NOT ENOUGH INFO": 2}

# Normalise spelling before the lookup. astype(str) lets the .str accessor
# work even when the column holds non-string cells (e.g. NaN for blanks).
normalized_labels = fever_df["label"].astype(str).str.strip().str.upper()
mapped_labels = normalized_labels.map(label_mapping)

# Series.map yields NaN for every value missing from the mapping. Left in
# place, those NaN labels would break the stratified split and SMOTE later on,
# so report them and drop the affected rows here.
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    print(
        f"Dropping {int(unmapped_mask.sum())} FEVER rows with unrecognised labels: "
        f"{sorted(normalized_labels[unmapped_mask].unique())}"
    )

# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the originally loaded data.
fever_df = fever_df.loc[~unmapped_mask].copy()
if fever_df.empty:
    raise ValueError("No FEVER rows with a recognised label remain after mapping.")

# Assign positionally (row order is unchanged by the boolean filter) and
# restore an integer dtype now that no NaN is left.
fever_df["label"] = mapped_labels[~unmapped_mask].astype("int64").to_numpy()

# Preprocess text from the "claim" column. Missing claims become empty strings
# so that preprocess_text always receives a str.
fever_df["clean_claim"] = fever_df["claim"].fillna("").astype(str).apply(preprocess_text)

# Optional: sample a subset if the dataset is large
# fever_df = fever_df.sample(n=30000, random_state=42)

# Hold out 20% of the rows as a test set; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# reproducible.
train_df, test_df = train_test_split(
    fever_df,
    stratify=fever_df["label"],
    test_size=0.2,
    random_state=42,
)
print(f"FEVER Train size: {len(train_df)}, Test size: {len(test_df)}")

# Turn the cleaned claims into TF-IDF features
print("Applying TF-IDF vectorization for FEVER...")

# Targets are taken straight from the two splits.
y_train_fever, y_test_fever = train_df["label"], test_df["label"]

# Unigrams through trigrams, English stop words removed, vocabulary capped
# at 10,000 entries.
vectorizer_fever = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
    max_features=10000,
)

# The vectorizer is fitted on the training claims only; the test claims are
# merely transformed with what was learned there.
X_train_fever = vectorizer_fever.fit_transform(train_df["clean_claim"])
X_test_fever = vectorizer_fever.transform(test_df["clean_claim"])

print("TF-IDF vectorization complete for FEVER.")

# Oversample the training data with SMOTE (the test set is left untouched)
print("Applying SMOTE to balance the FEVER training set...")
smote = SMOTE(random_state=42)
X_train_fever_res, y_train_fever_res = smote.fit_resample(
    X_train_fever,
    y_train_fever,
)

# Show how many training samples each class has after resampling
print("SMOTE applied. New class distribution:")
resampled_class_counts = pd.Series(y_train_fever_res).value_counts()
print(resampled_class_counts)

# Fit the Logistic Regression classifier on the resampled training data
print("Training Logistic Regression model for FEVER...")
model_fever = LogisticRegression(
    random_state=42,
    n_jobs=-1,
    max_iter=200,
).fit(X_train_fever_res, y_train_fever_res)  # fit() returns the estimator itself
print("Model training complete for FEVER.")

# Predict labels for the held-out test features
print("Making predictions on the FEVER test set...")
y_pred_fever = model_fever.predict(X_test_fever)

# Overall accuracy first ...
accuracy_fever = accuracy_score(y_test_fever, y_pred_fever)
print(f"\nFEVER Model Accuracy: {accuracy_fever:.4f}")

# ... then the per-class report and the confusion matrix, each under its heading
for heading, metric in (
    ("\nFEVER Classification Report:", classification_report),
    ("FEVER Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test_fever, y_pred_fever))

# Save FEVER evaluation results to Excel (written to the current working directory)
report_dict = classification_report(y_test_fever, y_pred_fever, output_dict=True)
fever_report_df = pd.DataFrame(report_dict).transpose()
fever_report_df.to_excel("LR_FEVER_Classification_Report.xlsx", index=True)

# test_df was derived from fever_df by train_test_split, so assigning a column
# to it in place triggers pandas' SettingWithCopyWarning. assign() builds a new
# DataFrame with the extra column instead; rebinding keeps the name test_df
# available, now including the predictions.
test_df = test_df.assign(predicted_label=y_pred_fever)
test_df.to_excel("LR_FEVER_Predictions.xlsx", index=False)
print("FEVER evaluation and predictions saved successfully.\n")
