# 03 - Week 3: XGBoost + SHAP

Steps:
1. Load processed data.
2. Train/test split with stratification.
3. Optional hyperparameter search.
4. Train XGBoost with class imbalance handling.
5. Evaluate and compare against baseline.
6. Interpret using SHAP.

In [None]:
import json
import joblib
import numpy as np
import pandas as pd
import shap
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix, roc_curve,
    precision_recall_curve
)
from xgboost import XGBClassifier

# Seed shared by the train/test split, the XGBoost estimator, the randomized
# search and the SHAP row sampling.
RANDOM_STATE = 42
# CSV file that is read into the modelling DataFrame.
DATA_PATH = Path('data/processed/hits_dataset.csv')
# Destination of the fitted model written with joblib.
MODEL_PATH = Path('models/final_xgboost.pkl')
# When True, the RandomizedSearchCV step is skipped and the fixed base
# configuration is used as the model.
SKIP_TUNING = True


In [None]:
df = pd.read_csv(DATA_PATH)

# Every column except the label and the identifier/metadata columns listed
# here is used as a model input; the original column order is preserved.
excluded_cols = {'is_hit', 'name', 'artist', 'id', 'release_date'}
feature_cols = [col for col in df.columns if col not in excluded_cols]

X = df[feature_cols]
y = df['is_hit']

# Stratified 80/20 hold-out so both splits keep the label ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Ratio of negatives to positives in the training split, handed to XGBoost
# as `scale_pos_weight`; falls back to 1.0 when there are no positives.
pos = y_train.sum()
neg = len(y_train) - pos
if pos:
    scale_pos_weight = neg / pos
else:
    scale_pos_weight = 1.0
print(f"scale_pos_weight: {scale_pos_weight:.2f}")


## 1. Hyperparameter search (optional)
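Set `SKIP_TUNING=False` to run a randomized hyperparameter search; with the default (`SKIP_TUNING=True`) the search is skipped and the fixed base configuration is used, which runs quickly.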

In [None]:
# Fixed baseline configuration. It is used directly when tuning is skipped
# and serves as the estimator handed to the randomized search otherwise.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 1.0,
    'scale_pos_weight': scale_pos_weight,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}
base_model = XGBClassifier(**xgb_params)

if SKIP_TUNING:
    # Quick path: keep the baseline configuration as-is.
    model = base_model
else:
    # Candidate values sampled by the randomized search.
    param_dist = {
        'max_depth': [4, 6, 8],
        'min_child_weight': [1, 5, 10],
        'subsample': [0.7, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.9, 1.0],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [200, 400, 600],
    }
    # 15 sampled configurations, each scored by average precision with
    # 3-fold cross-validation on the training split only.
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=15,
        scoring='average_precision',
        cv=3,
        random_state=RANDOM_STATE,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    model = search.best_estimator_
    print(f"Best params: {search.best_params_}")


## 2. Train the final model (evaluation set monitored; no early stopping configured)

In [None]:
# Fit the selected estimator on the training split.
#
# The evaluation metric ('aucpr') is already configured on the estimator, so
# it is not repeated here: `eval_metric` as a `fit()` keyword was deprecated
# in XGBoost 1.6 and removed in 2.0, where passing it raises a TypeError.
#
# NOTE: no `early_stopping_rounds` is set, so `eval_set` only records the
# metric per boosting round and all configured trees are built. If early
# stopping is enabled later, monitor a validation split carved out of the
# training data instead of the test set, otherwise the reported test metrics
# become optimistically biased.
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False,
)

# Hard labels (default 0.5 threshold) and positive-class probabilities for
# the held-out test split.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]


## 3. Evaluation

In [None]:
# Metrics computed from hard predictions share the same call signature, so
# they are evaluated together; insertion order fixes the printed key order.
thresholded_scorers = {
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

metrics_xgb = {
    'accuracy': accuracy_score(y_test, y_pred),
    **{
        metric_name: scorer(y_test, y_pred, zero_division=0)
        for metric_name, scorer in thresholded_scorers.items()
    },
    # Ranking metrics are computed from the positive-class probabilities.
    'roc_auc': roc_auc_score(y_test, y_proba),
    'pr_auc': average_precision_score(y_test, y_proba),
}
print(json.dumps(metrics_xgb, indent=2))


In [None]:
# matplotlib's savefig does not create missing directories, so make sure the
# output folder exists before any figure is written.
Path('figures').mkdir(parents=True, exist_ok=True)

# --- Confusion matrix (rows: actual class, columns: predicted class) ---
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('XGBoost Confusion Matrix')
plt.tight_layout()
plt.savefig('figures/xgboost_confusion_matrix.png', dpi=300)
plt.close()

# Curve coordinates; the threshold arrays are not needed for plotting.
fpr, tpr, _ = roc_curve(y_test, y_proba)
precision, recall, _ = precision_recall_curve(y_test, y_proba)

# --- ROC curve with the chance diagonal for reference ---
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"ROC-AUC: {metrics_xgb['roc_auc']:.3f}")
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBoost ROC Curve')
plt.legend()
plt.tight_layout()
plt.savefig('figures/xgboost_roc_curve.png', dpi=300)
plt.close()

# --- Precision-recall curve ---
plt.figure(figsize=(6,5))
plt.plot(recall, precision, label=f"PR-AUC: {metrics_xgb['pr_auc']:.3f}")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('XGBoost Precision-Recall Curve')
plt.legend()
plt.tight_layout()
plt.savefig('figures/xgboost_pr_curve.png', dpi=300)
plt.close()


## 4. SHAP interpretability

In [None]:
# matplotlib's savefig does not create missing directories, so make sure the
# output folder exists before any figure is written.
Path('figures').mkdir(parents=True, exist_ok=True)

explainer = shap.TreeExplainer(model)

# Explain a reproducible random subset of at most 1000 training rows.
sample_size = min(1000, len(X_train))
shap_background = X_train.sample(sample_size, random_state=RANDOM_STATE)

# Calling the explainer yields a `shap.Explanation`, which the `shap.plots.*`
# API requires; `shap.plots.bar` rejects the bare ndarray returned by
# `explainer.shap_values(...)`. The raw array stays available as
# `shap_values` for the legacy `summary_plot` call.
shap_explanation = explainer(shap_background)
shap_values = shap_explanation.values

# Beeswarm-style summary of per-row feature contributions.
shap.summary_plot(shap_values, shap_background, show=False)
plt.tight_layout()
plt.savefig('figures/shap_summary_detailed.png', dpi=300)
plt.close()

# Global importance: mean absolute SHAP value of the top 15 features.
shap.plots.bar(shap_explanation, max_display=15, show=False)
plt.tight_layout()
plt.savefig('figures/shap_feature_importance.png', dpi=300)
plt.close()


## 5. Persist model

In [None]:
# Create the target directory if it is missing, then serialize the fitted
# estimator with joblib.
model_dir = MODEL_PATH.parent
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, MODEL_PATH)
print(f"Saved XGBoost model to {MODEL_PATH}")
