<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-12/day12_ensembles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# day12/day12_ensembles.py
# Day 12: Bagging, Boosting, Voting - Titanic
# Run from project root: python day12/day12_ensembles.py

import warnings, json
warnings.filterwarnings("ignore")

from pathlib import Path
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# XGBoost is an optional dependency: remember whether the import worked so
# later sections can skip the XGBoost steps instead of failing.
try:
    from xgboost import XGBClassifier
except Exception:
    XGB_OK = False
else:
    XGB_OK = True

# Output folders (parent first, so the nested one can be created).
OUT = Path("day12")
ASSETS = OUT / "assets"
for _folder in (OUT, ASSETS):
    _folder.mkdir(exist_ok=True)

# --------------------------
# 0) Load best processed dataset (prefer Day10 output)
# --------------------------
# Candidate inputs in priority order; the first one that exists on disk wins.
candidates = [
    Path("day10/day10_titanic_feat.csv"),
    Path("day05/day05_titanic_feat.csv"),
    Path("day02/day02_titanic_clean.csv"),
    Path("train.csv"),
]
source_path = next((path for path in candidates if path.exists()), None)
if source_path is None:
    raise FileNotFoundError("No input CSV found. Put processed file in day10/day10_titanic_feat.csv or similar.")

print("Loading:", source_path)
df = pd.read_csv(source_path)

# The target column is required by everything that follows, so fail early.
if 'Survived' not in df.columns:
    raise KeyError("Loaded dataset must include 'Survived' column.")

# --------------------------
# 1) Safe preprocessing (minimal)
# --------------------------
# Work on a copy so the frame loaded above (`df`) is left untouched.
data = df.copy()

# Drop meta columns if present; errors='ignore' skips any that are absent.
data = data.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'], errors='ignore')

# Encode a string-valued Sex column as binary: 1 when the value starts with
# "m" (case-insensitive), 0 otherwise.
if 'Sex' in data.columns and data['Sex'].dtype == object:
    data['Sex'] = (data['Sex'].str.lower().str.startswith('m')).astype(int)

# Coerce Age/Fare to numeric and fill gaps with the column median.
# The filled Series is assigned back explicitly: calling
# `data[col].fillna(..., inplace=True)` operates on a temporary object and is
# a silent no-op under pandas Copy-on-Write, which would leave NaNs in place.
for col in ['Age', 'Fare']:
    if col in data.columns:
        numeric = pd.to_numeric(data[col], errors='coerce')
        data[col] = numeric.fillna(numeric.median())

# One-hot encode any remaining object/category columns (never the target).
cat_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
cat_cols = [c for c in cat_cols if c != 'Survived']
if cat_cols:
    data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

X = data.drop(columns=['Survived'])
y = data['Survived']

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
print("Shapes ->", X_train.shape, X_test.shape)

# helper: evaluate & save plots
def evaluate(name, model, X_test, y_test, prefix):
    """Score a fitted classifier on the given split and save its plots.

    Prints the metrics as JSON followed by the classification report, writes
    ``<prefix>_confusion.png`` into ASSETS and, when the model exposes
    ``predict_proba``, also writes ``<prefix>_roc.png``.

    Args:
        name: Label used in the printed header and in the plot titles.
        model: Fitted estimator with ``predict`` (``predict_proba`` optional).
        X_test: Features to predict on.
        y_test: True labels matching ``X_test``.
        prefix: File-name prefix for the saved PNG files.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc`` (``None`` when the model has no ``predict_proba``).
    """
    predicted = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        # Second probability column is the score fed to ROC / AUC.
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = None

    label_scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    metrics = {label: float(scorer(y_test, predicted)) for label, scorer in label_scorers}
    metrics["roc_auc"] = float(roc_auc_score(y_test, scores)) if scores is not None else None

    print(f"\n== {name} ==\n", json.dumps(metrics, indent=2))
    print(classification_report(y_test, predicted))

    # Confusion-matrix heatmap.
    matrix = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(4, 3))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} Confusion")
    plt.xlabel("Pred")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(ASSETS / f"{prefix}_confusion.png")
    plt.close()

    # ROC curve, only possible when probability scores are available.
    if scores is not None:
        false_pos, true_pos, _ = roc_curve(y_test, scores)
        plt.figure(figsize=(5, 4))
        plt.plot(false_pos, true_pos, label=f"AUC={metrics['roc_auc']:.3f}")
        plt.plot([0, 1], [0, 1], '--', color='gray')
        plt.title(f"{name} ROC")
        plt.legend()
        plt.tight_layout()
        plt.savefig(ASSETS / f"{prefix}_roc.png")
        plt.close()

    return metrics

# Collected outcomes, keyed by a short model name.
results = {}

# --------------------------
# 2) Bagging (BaggingClassifier with DecisionTree)
# --------------------------
print("\nTraining BaggingClassifier (50 trees)...")
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=None),
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
# Persist the fitted model, then score it on the held-out split.
joblib.dump(bag, ASSETS / "bagging.joblib")
bagging_metrics = evaluate("Bagging", bag, X_test, y_test, "bagging")
results['bagging'] = bagging_metrics

# --------------------------
# 3) AdaBoost (boosting)
# --------------------------
print("\nTraining AdaBoost (50 estimators, shallow trees)...")
# Baseline booster built on depth-1 trees.
ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
).fit(X_train, y_train)
joblib.dump(ada, ASSETS / "adaboost.joblib")
results['adaboost'] = evaluate("AdaBoost", ada, X_test, y_test, "adaboost")

# Small GridSearch for AdaBoost (tiny grid, Day 5 style)
print("\nRunning small GridSearchCV for AdaBoost (quick)...")
param_grid = {
    "n_estimators": [50, 100],
    "learning_rate": [0.5, 1.0],
}
gs = GridSearchCV(
    AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), random_state=42),
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

# Keep the full cross-validation table for later inspection.
cv_table = pd.DataFrame(gs.cv_results_)
cv_table.to_csv(OUT / "day12_adaboost_gridsearch.csv", index=False)
print("AdaBoost GridSearch saved to day12/day12_adaboost_gridsearch.csv")

# Best estimator found by the search: persist it and score it on the test split.
best_ada = gs.best_estimator_
joblib.dump(best_ada, ASSETS / "adaboost_tuned.joblib")
tuned_metrics = evaluate("AdaBoost (tuned)", best_ada, X_test, y_test, "adaboost_tuned")
results['adaboost_grid'] = {
    "best_params": gs.best_params_,
    "cv_best_score": float(gs.best_score_),
    "test_metrics": tuned_metrics,
}

# --------------------------
# 4) GradientBoosting (sklearn)
# --------------------------
print("\nTraining GradientBoostingClassifier (sklearn)...")
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)
# Persist the fitted model, then score it on the held-out split.
joblib.dump(gb, ASSETS / "gradient_boosting.joblib")
gb_metrics = evaluate("GradientBoosting", gb, X_test, y_test, "gb")
results['gradient_boosting'] = gb_metrics

# --------------------------
# 5) XGBoost (if available)
# --------------------------
# Train XGBoost only when the optional import at the top of the file succeeded.
if XGB_OK:
    print("\nTraining XGBoost (light config)...")
    # `use_label_encoder` is deliberately not passed: the argument is
    # deprecated in XGBoost 1.6 and removed in 2.0, where supplying it only
    # yields an "unused parameter" warning.
    xgb = XGBClassifier(
        n_estimators=150,
        learning_rate=0.05,
        max_depth=4,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1,
    )
    xgb.fit(X_train, y_train)
    joblib.dump(xgb, ASSETS / "xgboost.joblib")
    results['xgboost'] = evaluate("XGBoost", xgb, X_test, y_test, "xgb")
else:
    print("\nXGBoost not installed — skipping XGBoost step. To enable, pip install xgboost")

# --------------------------
# 6) Voting Classifier (soft voting)
# --------------------------
print("\nTraining VotingClassifier (LR + RF + AdaBoost/XGB if available)...")
# Base learners for the ensemble as (name, estimator) pairs.
estimators = [
    ('lr', LogisticRegression(max_iter=2000)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
]
if XGB_OK:
    # `use_label_encoder` is deliberately not passed: the argument is
    # deprecated in XGBoost 1.6 and removed in 2.0.
    estimators.append((
        'xgb',
        XGBClassifier(
            n_estimators=100,
            learning_rate=0.05,
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,
        ),
    ))
# Use the tuned AdaBoost when the grid-search step has produced one; the
# fallback keeps this section usable if that step is skipped.
# VotingClassifier fits clones of the estimators it is given, so passing an
# already-fitted model here does not modify it.
if 'best_ada' in locals():
    estimators.append(('ada', best_ada))
else:
    estimators.append(('ada', AdaBoostClassifier(n_estimators=50, estimator=DecisionTreeClassifier(max_depth=1), random_state=42)))

# Soft voting combines the members' predicted class probabilities.
voting = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
voting.fit(X_train, y_train)
joblib.dump(voting, ASSETS / "voting.joblib")
results['voting'] = evaluate("VotingClassifier", voting, X_test, y_test, "voting")

# --------------------------
# 7) Compare & save summary table
# --------------------------
# Build a simple metrics table
# One table row per model: plain entries carry the metrics directly, the
# grid-search entry nests them under 'test_metrics'.
rows = []
for model_name, entry in results.items():
    if not isinstance(entry, dict):
        continue
    if 'accuracy' in entry:
        metric_values = entry
    elif 'test_metrics' in entry:
        metric_values = entry['test_metrics']
    else:
        continue
    rows.append({"model": model_name, **metric_values})

df_res = pd.DataFrame(rows)
df_res.to_csv(OUT / "day12_results.csv", index=False)

# The JSON file keeps the complete results dict, not just the flat table.
with open(OUT / "day12_results.json", "w") as json_file:
    json.dump(results, json_file, indent=2)

print("\nSaved summary to day12/day12_results.csv and day12/day12_results.json")
print("Assets saved to", ASSETS)

# --------------------------
# 8) Quick observations file
# --------------------------
# Rank the models by F1 (best first) and render the table as plain text.
ranking_table = df_res.sort_values('f1', ascending=False).to_string(index=False)
notes = [
    "Day 12: Bagging, Boosting, Voting experiments",
    f"Top models by f1 (check day12_results.csv):\n{ranking_table}",
    "Next steps: try stacking, larger RandomizedSearchCV on best booster, use SHAP for explanation.",
]
(OUT / "day12_notes.md").write_text("\n".join(notes))

print("\nDay 12 complete ✅")

Loading: day10/day10_titanic_feat.csv
Shapes -> (712, 20) (179, 20)

Training BaggingClassifier (50 trees)...

== Bagging ==
 {
  "accuracy": 0.8100558659217877,
  "precision": 0.7868852459016393,
  "recall": 0.6956521739130435,
  "f1": 0.7384615384615385,
  "roc_auc": 0.820882740447958
}
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       110
           1       0.79      0.70      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.81      0.81      0.81       179


Training AdaBoost (50 estimators, shallow trees)...

== AdaBoost ==
 {
  "accuracy": 0.8044692737430168,
  "precision": 0.7297297297297297,
  "recall": 0.782608695652174,
  "f1": 0.7552447552447552,
  "roc_auc": 0.82733860342556
}
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       110
           1       0.73      0.78      0.7