In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply seaborn's "whitegrid" theme once, up front. sns.set changes global
# plotting defaults, so figures drawn later in the notebook pick it up.
sns.set(style="whitegrid")

# Model 1: Logistic Regression (Baseline)

In [None]:
# Read the preprocessed train / test tables from disk
train_df = pd.read_csv("processed_data/train_processed.csv")
test_df = pd.read_csv("processed_data/test_processed.csv")

# Pull the label column out of each table; every remaining column is a feature
y_train = train_df['target']
X_train = train_df.drop(columns='target')

y_test = test_df['target']
X_test = test_df.drop(columns='target')

print("Training Data Shape:", X_train.shape)

# PCA Step (Requirement)
Determine the number of principal components needed to explain at least 95% of the variance in the data.

In [None]:
# Report the dimensionality before any reduction is applied
print(f"Original Features: {X_train.shape[1]}")

# A float n_components in (0, 1) tells PCA to keep just enough components
# to reach that share of explained variance (0.95 -> 95%)
pca = PCA(n_components=0.95)

# The projection is learned from the training features only and then
# reused unchanged on the test features
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"PCA Components for 95% Variance: {X_train_pca.shape[1]}")

# Train Logistic Regression

In [None]:
# Build the baseline classifier and fit it on the PCA-reduced training
# features; fit() returns the estimator itself, so the two steps are chained
log_reg = LogisticRegression(random_state=42).fit(X_train_pca, y_train)

# Class predictions for the PCA-reduced test features
y_pred = log_reg.predict(X_test_pca)

# Evaluation

In [None]:
# Overall accuracy of the predictions against the test labels
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Text report of the per-class metrics
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Confusion matrix, drawn through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set_title('Confusion Matrix - Logistic Regression')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()