# Week 2 – Baseline Modeling (Logistic Regression)

This notebook builds a robust baseline using a dummy classifier and logistic regression with class balancing. It reports accuracy, precision, recall, F1, ROC-AUC, PR-AUC, and saves evaluation plots and the trained model.

In [None]:
import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (ConfusionMatrixDisplay, average_precision_score,
                             classification_report, confusion_matrix, f1_score,
                             precision_recall_curve, precision_score, recall_score,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# --- Configuration: input dataset and output directories ---------------------
DATA_PATH = Path('data/processed/hits_dataset.csv')
FIG_DIR = Path('figures')
MODEL_DIR = Path('models')
FIG_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Explicit raise rather than `assert`: assertions are removed when Python runs
# with -O, which would let a missing file fail later with a less helpful error.
if not DATA_PATH.exists():
    raise FileNotFoundError('Run Week 1 notebook first to generate hits_dataset.csv')

# --- Load data and select modeling columns -----------------------------------
df = pd.read_csv(DATA_PATH)
feature_candidates = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo'
]
# Only candidates that are actually present in the CSV are used as features.
features = [f for f in feature_candidates if f in df.columns]
missing_features = [f for f in feature_candidates if f not in df.columns]
target = 'is_hit'

# Fail early with a clear message instead of a cryptic error further down.
if target not in df.columns:
    raise KeyError(f"Target column '{target}' not found in {DATA_PATH}")
if not features:
    raise ValueError(f'None of the candidate feature columns are present in {DATA_PATH}')
if missing_features:
    # Surface the silent column filtering so the reader knows what the model saw.
    print(f'Warning: candidate features missing from dataset and skipped: {missing_features}')

X = df[features]
y = df[target]

# --- Hold-out split -----------------------------------------------------------
# Stratify on the target so both splits keep the same class proportions;
# the fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Baseline: Dummy classifier

A stratified dummy classifier establishes the performance floor for an imbalanced dataset.

In [None]:
# Performance floor: a classifier that ignores the features entirely.
# The fixed random_state keeps its random predictions reproducible.
dummy = DummyClassifier(strategy='stratified', random_state=42).fit(X_train, y_train)

dummy_preds = dummy.predict(X_test)
dummy_probs = dummy.predict_proba(X_test)[:, 1]  # second column of predict_proba

# Same threshold-free metrics that are reported for the real model below.
dummy_metrics = dict(
    accuracy=dummy.score(X_test, y_test),
    roc_auc=roc_auc_score(y_test, dummy_probs),
    pr_auc=average_precision_score(y_test, dummy_probs),
)
print('Dummy metrics:', dummy_metrics)

## Logistic regression with class balancing

We standardize features and enable `class_weight="balanced"` to address severe imbalance.

In [None]:
# Baseline model: standardize the features, then fit a class-weighted logistic
# regression. The scaler is a pipeline step, so it is fitted on the training
# split only when `log_reg.fit` is called.
log_reg = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        max_iter=200,
        class_weight='balanced',
        solver='lbfgs'
    ))
])

log_reg.fit(X_train, y_train)
probs = log_reg.predict_proba(X_test)[:, 1]  # second column of predict_proba
preds = log_reg.predict(X_test)              # hard labels from the model's own decision rule

# Accuracy, precision, recall and F1 are all computed from `preds`, so they
# describe the same operating point. Label 1 is treated as the positive class
# (scikit-learn's default `pos_label`). `zero_division=0` reports 0.0 instead
# of warning when the model predicts no positives at all.
metrics = {
    'accuracy': float(log_reg.score(X_test, y_test)),
    'precision': float(precision_score(y_test, preds, zero_division=0)),
    'recall': float(recall_score(y_test, preds, zero_division=0)),
    'f1': float(f1_score(y_test, preds, zero_division=0)),
    'roc_auc': float(roc_auc_score(y_test, probs)),
    'pr_auc': float(average_precision_score(y_test, probs))
}

# Best F1 reachable over every threshold on the precision-recall curve.
# The threshold is picked on the test set, so this is an optimistic upper bound
# and is stored under its own key rather than as the headline 'f1'.
precision_vals, recall_vals, _ = precision_recall_curve(y_test, probs)
f1_scores = 2 * (precision_vals * recall_vals) / (precision_vals + recall_vals + 1e-9)
metrics['f1_max'] = float(np.max(f1_scores))

print('Logistic regression metrics:', metrics)
print('\nClassification report:\n', classification_report(y_test, preds, digits=3))

## Visual evaluation

Confusion matrix, ROC curve, and precision-recall curve provide a holistic view of performance on the minority class.

In [None]:
# --- Confusion matrix (computed from the hard predictions in `preds`) --------
cm = confusion_matrix(y_test, preds, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Non-hit', 'Hit'])
disp.plot(values_format='d')
# `disp.plot` creates its own figure/axes; address them through the display object.
disp.ax_.set_title('Logistic Regression Confusion Matrix')
disp.figure_.tight_layout()
disp.figure_.savefig(FIG_DIR / 'logreg_confusion_matrix.png', dpi=300)
plt.close(disp.figure_)

# --- ROC curve ----------------------------------------------------------------
fpr, tpr, _ = roc_curve(y_test, probs)
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(fpr, tpr, label=f"ROC AUC = {metrics['roc_auc']:.3f}")
# Diagonal reference line for comparison with the model's curve.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set(
    title='ROC Curve',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
)
ax.legend()
fig.tight_layout()
fig.savefig(FIG_DIR / 'logreg_roc.png', dpi=300)
plt.close(fig)

# --- Precision-recall curve ---------------------------------------------------
precision_vals, recall_vals, _ = precision_recall_curve(y_test, probs)
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(recall_vals, precision_vals, label=f"PR AUC = {metrics['pr_auc']:.3f}")
ax.set(
    title='Precision-Recall Curve',
    xlabel='Recall',
    ylabel='Precision',
)
ax.legend()
fig.tight_layout()
fig.savefig(FIG_DIR / 'logreg_pr.png', dpi=300)
plt.close(fig)

## Coefficient analysis and model saving

Examine feature coefficients for interpretability and persist the trained baseline for comparison against XGBoost.

In [None]:
# --- Coefficient table --------------------------------------------------------
# The 'model' step follows the StandardScaler step in the pipeline, so these
# coefficients apply to the standardized features, not the raw columns.
fitted_logreg = log_reg.named_steps['model']
coef = fitted_logreg.coef_[0]
coef_df = (
    pd.DataFrame({'feature': features, 'coefficient': coef})
    .sort_values(by='coefficient', ascending=False)
)
print(coef_df)

# --- Coefficient bar chart ----------------------------------------------------
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x='coefficient', y='feature', data=coef_df, palette='vlag', ax=ax)
ax.set_title('Logistic Regression Coefficients')
fig.tight_layout()
fig.savefig(FIG_DIR / 'logreg_coefficients.png', dpi=300)
plt.close(fig)

# --- Persist the fitted pipeline and its metrics ------------------------------
# The whole pipeline (scaler + model) is saved, not just the classifier.
MODEL_DIR.mkdir(exist_ok=True)
model_path = MODEL_DIR / 'baseline_logreg.pkl'
joblib.dump(log_reg, model_path)
print(f'Saved baseline model to {model_path}')

metrics_path = MODEL_DIR / 'baseline_metrics.json'
with metrics_path.open('w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)
print('Stored baseline metrics for downstream comparison.')