In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Read the preprocessed train/test splits from disk.
# Any NaN in 'cleaned_message' (which may arise from preprocessing) is replaced
# with an empty string immediately after loading.
train_df = pd.read_csv('../../data/processed/train.csv').assign(
    cleaned_message=lambda frame: frame['cleaned_message'].fillna('')
)
test_df = pd.read_csv('../../data/processed/test.csv').assign(
    cleaned_message=lambda frame: frame['cleaned_message'].fillna('')
)

# Features are the cleaned message text; targets come from the 'label' column.
X_train, y_train = train_df['cleaned_message'], train_df['label']
X_test, y_test = test_df['cleaned_message'], test_df['label']

print("Data loaded successfully.")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Data loaded successfully.
Training samples: 4457
Testing samples: 1115


In [2]:
# --- Researcher's Justification ---
# Messages are converted to TF-IDF features, a representation chosen for being both
# effective and interpretable in text classification.
# The vectorizer is fitted on the training split ONLY; the test split is merely
# transformed with it, so no test-set information leaks into training.
# `max_features` caps the vocabulary size, which helps generalization and keeps the
# computational load down.
# ------------------------------------
vectorizer = TfidfVectorizer(max_features=3000)

# Fit on the training messages and vectorize them in the same call
X_train_tfidf = vectorizer.fit_transform(X_train)
# Test messages go through the already-fitted vectorizer (transform only, never fit)
X_test_tfidf = vectorizer.transform(X_test)

print(
    "TF-IDF vectorization complete.",
    f"Shape of TF-IDF training matrix: {X_train_tfidf.shape}",
    sep="\n",
)

TF-IDF vectorization complete.
Shape of TF-IDF training matrix: (4457, 3000)


In [3]:
# --- Researcher's Justification ---
# Logistic Regression is the baseline model: lightweight, fast, and highly interpretable.
# `class_weight='balanced'` counteracts the class imbalance by weighting each class
# inversely to its frequency in the training labels, so the minority class (spam)
# receives more attention during fitting.
# `random_state` is fixed for reproducibility; `max_iter=1000` sets the solver's
# iteration cap.
# NOTE(review): with the default solver `random_state` may have no effect — confirm
# against the scikit-learn LogisticRegression documentation.
# ------------------------------------
model = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
).fit(X_train_tfidf, y_train)  # fit() returns the fitted estimator itself

print("Centralized Logistic Regression model trained.")

Centralized Logistic Regression model trained.


In [4]:
# Predict labels for the held-out test set
y_pred = model.predict(X_test_tfidf)

# --- Researcher's Justification ---
# We compute a full suite of metrics.
# - Accuracy: Overall correctness. Can be misleading with imbalanced data.
# - Precision (for spam=1): Of all messages predicted as spam, how many were actually spam? (Measures false positives)
# - Recall (for spam=1): Of all actual spam messages, how many did we catch? (Measures false negatives)
# - F1-Score: The harmonic mean of precision and recall, providing a single, robust metric for imbalanced classes.
# The classification report provides a clean summary of these key metrics.
#
# Spam (label 1) is stated explicitly as the positive class via `pos_label=1`.
# `labels=[0, 1]` is passed to the report so that `target_names` always map to the
# intended classes; without it the labels are inferred from the data, and the call
# raises if the number of inferred classes does not match `target_names`.
# ------------------------------------
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

print("--- Centralized Baseline Performance ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\n--- Classification Report ---")
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=['Ham (0)', 'Spam (1)'],
    )
)

--- Centralized Baseline Performance ---
Accuracy: 0.9812
Precision: 0.9384
Recall: 0.9195
F1-Score: 0.9288

--- Classification Report ---
              precision    recall  f1-score   support

     Ham (0)       0.99      0.99      0.99       966
    Spam (1)       0.94      0.92      0.93       149

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

