<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-9/day09_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# day09_xgboost.py
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import json
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
import joblib

# xgboost import
from xgboost import XGBClassifier
# optional: lightgbm
try:
    import lightgbm as lgb
    LIGHTGBM = True
except Exception:
    LIGHTGBM = False

OUT = Path("day09"); OUT.mkdir(exist_ok=True)
ASSETS = OUT / "assets"; ASSETS.mkdir(exist_ok=True)

# -------------------------
# 0) Load data (processed if available)
# -------------------------
candidates = [Path("day05/day05_titanic_feat.csv"), Path("day05_titanic_feat.csv"),
              Path("day02/day02_titanic_clean.csv"), Path("train.csv")]
data = None
for p in candidates:
    if p.exists():
        print("Loading:", p)
        data = pd.read_csv(p)
        break
if data is None:
    raise FileNotFoundError("No input CSV found. Put day05/day05_titanic_feat.csv or day02/day02_titanic_clean.csv in the project.")

if 'Survived' not in data.columns:
    raise KeyError("'Survived' column not found in loaded data.")

# -------------------------
# 1) Safe preprocessing (minimal)
# -------------------------
df = data.copy()
# drop useless columns
for c in ['PassengerId','Ticket','Cabin','Name']:
    if c in df.columns: df.drop(columns=[c], inplace=True)

# encode Sex string if present
if 'Sex' in df.columns and df['Sex'].dtype == object:
    df['Sex'] = (df['Sex'].str.lower().str.startswith('m')).astype(int)

# fill numeric missings
for c in ['Age','Fare']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[c].fillna(df[c].median(), inplace=True)

# one-hot encode object columns
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
cat_cols = [c for c in cat_cols if c != 'Survived']
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# split
X = df.drop(columns=['Survived'])
y = df['Survived']
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

# Make a small validation split from training for early stopping
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.15, stratify=y_train_full, random_state=42)

print("Shapes -> train:", X_train.shape, "val:", X_val.shape, "test:", X_test.shape)

# -------------------------
# 2) Baseline XGBoost with early stopping
# -------------------------
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

xgb.fit(
    X_train, y_train
)

# predict & evaluate on test
def metrics_report(model, Xt, yt):
    yp = model.predict(Xt)
    yp_proba = model.predict_proba(Xt)[:,1] if hasattr(model, "predict_proba") else None
    metrics = {
        "accuracy": float(accuracy_score(yt, yp)),
        "precision": float(precision_score(yt, yp)),
        "recall": float(recall_score(yt, yp)),
        "f1": float(f1_score(yt, yp)),
        "roc_auc": float(roc_auc_score(yt, yp_proba)) if yp_proba is not None else None
    }
    print(json.dumps(metrics, indent=2))
    print(classification_report(yt, yp))
    return metrics, yp, yp_proba

print("\n--- XGBoost baseline test metrics ---")
xgb_metrics, xgb_y_pred, xgb_y_proba = metrics_report(xgb, X_test, y_test)

# save model and feature importances plot
joblib.dump(xgb, ASSETS/"xgb_baseline.joblib")
fi = pd.Series(xgb.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(6,8)); sns.barplot(x=fi.values[:20], y=fi.index[:20]); plt.title("XGB Feature Importances (top20)"); plt.tight_layout(); plt.savefig(ASSETS/"xgb_feature_importances.png"); plt.close()

# ROC curve plot
if xgb_y_proba is not None:
    fpr, tpr, _ = roc_curve(y_test, xgb_y_proba)
    plt.figure(figsize=(5,4)); plt.plot(fpr,tpr, label=f"AUC={xgb_metrics['roc_auc']:.3f}"); plt.plot([0,1],[0,1],'--', color='gray'); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("XGB ROC"); plt.legend(); plt.tight_layout(); plt.savefig(ASSETS/"xgb_roc.png"); plt.close()

# -------------------------
# 3) Quick (light) RandomizedSearch for XGBoost (optional, may take time)
# -------------------------
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'n_estimators':[100,200,300],
    'max_depth':[3,4,5,6],
    'learning_rate':[0.01,0.03,0.05,0.1],
    'subsample':[0.6,0.8,1.0],
    'colsample_bytree':[0.6,0.8,1.0],
    'reg_alpha':[0,0.1,1],
    'reg_lambda':[1,2,5]
}
rs = RandomizedSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
    param_distributions=param_dist,
    n_iter=12, cv=3, scoring='f1', verbose=1, n_jobs=-1, random_state=42
)
print("Running RandomizedSearchCV (this may take several minutes)...")
rs.fit(X_train_full, y_train_full)  # note: uses full train to speed validation inside CV
print("Best params:", rs.best_params_)
best_xgb = rs.best_estimator_
joblib.dump(best_xgb, ASSETS/"xgb_random_search_best.joblib")

# evaluate tuned on test
print("\n--- XGBoost tuned test metrics ---")
tuned_metrics, tuned_y_pred, tuned_y_proba = metrics_report(best_xgb, X_test, y_test)

# -------------------------
# 4) Try LightGBM (if installed)
# -------------------------
if LIGHTGBM:
    print("\nTrying LightGBM...")
    lgbm = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42, n_jobs=-1)
    lgbm.fit(X_train, y_train)
    joblib.dump(lgbm, ASSETS/"lgbm_baseline.joblib")
    print("\n--- LightGBM test metrics ---")
    lgbm_metrics, _, _ = metrics_report(lgbm, X_test, y_test)

# -------------------------
# 5) SHAP explanation (optional, slow)
# -------------------------
try:
    import shap
    explainer = shap.TreeExplainer(best_xgb if 'best_xgb' in locals() else xgb)
    sample = X_test.sample(n=min(200, X_test.shape[0]), random_state=42)
    shap_values = explainer.shap_values(sample)
    shap.summary_plot(shap_values, sample, show=False)
    plt.tight_layout(); plt.savefig(ASSETS/"shap_summary.png"); plt.close()
    print("SHAP summary saved")
except Exception as e:
    print("SHAP not run (missing package or error):", e)

# -------------------------
# 6) Save results summary
# -------------------------
results = {
    "xgb_baseline": xgb_metrics,
    "xgb_tuned": tuned_metrics if 'tuned_metrics' in locals() else None,
    "lightgbm": lgbm_metrics if 'lgbm_metrics' in locals() else None
}
with open(OUT/"day09_results.json","w") as f:
    json.dump(results, f, indent=2)

print("Saved results & assets in", OUT)

Loading: train.csv
Shapes -> train: (643, 8) val: (114, 8) test: (134, 8)

--- XGBoost baseline test metrics ---
{
  "accuracy": 0.7611940298507462,
  "precision": 0.7209302325581395,
  "recall": 0.6078431372549019,
  "f1": 0.6595744680851063,
  "roc_auc": 0.7767540751240256
}
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        83
           1       0.72      0.61      0.66        51

    accuracy                           0.76       134
   macro avg       0.75      0.73      0.74       134
weighted avg       0.76      0.76      0.76       134

Running RandomizedSearchCV (this may take several minutes)...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best params: {'subsample': 0.8, 'reg_lambda': 5, 'reg_alpha': 1, 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.6}

--- XGBoost tuned test metrics ---
{
  "accuracy": 0.7761194029850746,
  "precision": 0.7333333333333333,
  "recall": 0.64