# 02 - Week 2: Baseline Modeling (Logistic Regression)

Steps:
1. Load processed dataset.
2. Train/test split with stratification.
3. Dummy classifier baseline.
4. Logistic Regression with class balancing.
5. Evaluate using accuracy, precision, recall, F1, ROC-AUC, and PR-AUC.
6. Interpret coefficients.

In [None]:
import json
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix, roc_curve,
    precision_recall_curve
)

# Seed passed to both the train/test split and the logistic regression model.
RANDOM_STATE = 42
# CSV file that is loaded into the modelling DataFrame.
DATA_PATH = Path('data/processed/hits_dataset.csv')
# File the fitted baseline pipeline is written to with joblib.
MODEL_PATH = Path('models/baseline_logreg.pkl')


In [None]:
# Read the processed dataset from disk.
df = pd.read_csv(DATA_PATH)

# Model inputs are all columns other than the target `is_hit` and the
# excluded columns listed here; the original column order is preserved.
non_feature_cols = {'is_hit', 'name', 'artist', 'id', 'release_date'}
feature_cols = [col for col in df.columns if col not in non_feature_cols]
X = df.loc[:, feature_cols]
y = df.loc[:, 'is_hit']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


## 1. Dummy classifier baseline
This baseline ignores the features and always predicts the majority class, which gives a sanity-check floor that the real model has to beat.

In [None]:
# Majority-class baseline: ignores the features and always predicts the most
# frequent label seen in the training split.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
p_dummy = dummy.predict(X_test)
# Positive-class scores of the baseline (the same value for every row);
# required by the ranking metrics below.
proba_dummy = dummy.predict_proba(X_test)[:, 1]
metrics_dummy = {
    'accuracy': accuracy_score(y_test, p_dummy),
    # zero_division=0 reports 0 (without a warning) when a ratio is undefined,
    # e.g. precision when the baseline never predicts the positive class.
    'precision': precision_score(y_test, p_dummy, zero_division=0),
    'recall': recall_score(y_test, p_dummy, zero_division=0),
    'f1': f1_score(y_test, p_dummy, zero_division=0),
    # Chance-level references for the ranking metrics reported for the
    # logistic regression: constant scores give ROC-AUC 0.5 and an average
    # precision equal to the share of positives in the test split.
    'roc_auc': roc_auc_score(y_test, proba_dummy),
    'pr_auc': average_precision_score(y_test, proba_dummy),
}
print(json.dumps(metrics_dummy, indent=2))


## 2. Logistic Regression with class weights
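`class_weight='balanced'` weights each class inversely to its frequency in the training data, so the scarce hit class is not drowned out by the non-hit majority.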

In [None]:
# Baseline model: standardise the features, then fit a class-weighted
# logistic regression. Keeping the scaler inside the pipeline means its
# statistics are learned from the rows passed to fit() only and are re-applied
# automatically at prediction time.
#
# n_jobs is deliberately not set: LogisticRegression only used it to
# parallelise over classes, which does nothing for this binary target, and
# recent scikit-learn releases deprecate the argument.
logreg = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        solver='lbfgs',
        random_state=RANDOM_STATE
    ))
])

logreg.fit(X_train, y_train)
# Hard class predictions for the threshold-based metrics.
y_pred = logreg.predict(X_test)
# Probability of the second class (column 1), used for the ROC and PR metrics.
y_proba = logreg.predict_proba(X_test)[:, 1]


## 3. Evaluation

In [None]:
# Collect the test-set metrics in a fixed key order: accuracy first, then the
# metrics computed from hard predictions, then the ones computed from scores.
metrics_logreg = {
    'accuracy': accuracy_score(y_test, y_pred),
    # Threshold-based metrics; zero_division=0 reports 0 instead of warning
    # when a ratio is undefined.
    **{
        name: scorer(y_test, y_pred, zero_division=0)
        for name, scorer in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    # Ranking metrics, computed from the positive-class probabilities.
    **{
        name: scorer(y_test, y_proba)
        for name, scorer in (
            ('roc_auc', roc_auc_score),
            ('pr_auc', average_precision_score),
        )
    },
}
print(json.dumps(metrics_logreg, indent=2))


In [None]:
# savefig does not create missing directories, so make sure the output folder
# exists before any figure is written.
Path('figures').mkdir(parents=True, exist_ok=True)

# Confusion matrix of the hard predictions, drawn as an annotated heatmap
# (fmt='d' prints the cell counts as integers).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Logistic Regression Confusion Matrix')
plt.tight_layout()
plt.savefig('figures/logreg_confusion_matrix.png', dpi=300)
# Close each figure after saving so it is not displayed or kept in memory.
plt.close()

# Curve coordinates from the predicted probabilities; the thresholds returned
# as the third value are not needed for plotting.
fpr, tpr, _ = roc_curve(y_test, y_proba)
precision, recall, _ = precision_recall_curve(y_test, y_proba)

# ROC curve, with the diagonal of a random classifier as a dashed reference.
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"ROC-AUC: {metrics_logreg['roc_auc']:.3f}")
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.tight_layout()
plt.savefig('figures/logreg_roc_curve.png', dpi=300)
plt.close()

# Precision-recall curve.
plt.figure(figsize=(6,5))
plt.plot(recall, precision, label=f"PR-AUC: {metrics_logreg['pr_auc']:.3f}")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.tight_layout()
plt.savefig('figures/logreg_pr_curve.png', dpi=300)
plt.close()


## 4. Coefficient interpretation

In [None]:
# Pull the fitted steps back out of the pipeline.
model = logreg.named_steps['model']
scaler = logreg.named_steps['scaler']
# coef_ has one row for a binary problem; take it as a flat vector.
coef = model.coef_[0]
# The model was fitted on standardised inputs, so dividing by each feature's
# scale converts the coefficient to log-odds change per one raw unit of that
# feature (despite the variable name, this undoes the scaling).
scaled_coef = coef / scaler.scale_
coef_df = pd.DataFrame({'feature': feature_cols, 'coefficient': scaled_coef}).sort_values('coefficient', ascending=False)

# savefig does not create missing directories, so make sure the output folder
# exists before the figure is written.
Path('figures').mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(8,6))
sns.barplot(data=coef_df, x='coefficient', y='feature', palette='coolwarm')
plt.title('Logistic Regression Coefficients')
plt.tight_layout()
plt.savefig('figures/logreg_coefficients.png', dpi=300)
plt.close()

# Last expression of the cell: displays the five largest coefficients.
coef_df.head()


## 5. Persist model

In [None]:
# Create the target directory (and any missing parents) if needed, then
# serialise the whole fitted pipeline -- scaler and model together -- to disk.
MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
joblib.dump(value=logreg, filename=MODEL_PATH)
print(f"Saved baseline model to {MODEL_PATH}")


Move on to `03_Week3_XGBoost_SHAP.ipynb`.