<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-22/day22_ensemble_blend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
"""
day22_ensemble_blend.py
Day 22 - Quick ensemble blending & model comparison (2-hour session)

What it does:
- Loads processed train/test CSVs (fix bool dtypes)
- Tries to load tuned models (random search pickles / stacking pipeline) from models/
- If not found, trains quick RandomForest & GradientBoosting (+ XGB/LGB if installed)
- For each candidate pipeline: computes out-of-fold (OOF) probabilities with StratifiedKFold
- Searches a small weight grid for best weighted-average blend (2 or 3 models)
- Trains pipelines on full data and writes blended test submission (optional)
- Saves outputs to outputs/ and models/

Designed to be conservative in runtime to fit a 2-hour window.
"""

import os
import glob
import joblib
from pathlib import Path
from datetime import datetime
import itertools
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from packaging import version
import sklearn

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score

# Optional libs
# Each optional gradient-boosting backend is probed once at import time. The
# matching HAS_* flag records whether the import succeeded so later code can
# skip that backend instead of failing.
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:  # any failure (not only ImportError) marks XGBoost as unavailable
    HAS_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAS_LGB = True
except Exception:  # any failure (not only ImportError) marks LightGBM as unavailable
    HAS_LGB = False

# ----------------- CONFIG -----------------
# Input locations: both processed CSVs live directly under DATA_DIR.
DATA_DIR = Path("data/processed")
TRAIN_CSV = DATA_DIR / "train_processed.csv"
# Optional input: when this file does not exist, main() skips the submission.
# The name is joined relative to DATA_DIR; repeating "data/processed/" here
# would resolve to data/processed/data/processed/... and never be found.
TEST_CSV = DATA_DIR / "test_processed.csv"

# Output locations (created on import if missing).
MODELS_DIR = Path("models")
OUT_DIR = Path("outputs")
SUB_DIR = Path("submissions")

for p in [MODELS_DIR, OUT_DIR, SUB_DIR]:
    p.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42        # seed shared by the estimators and the CV splitter
CV_FOLDS = 5             # number of stratified folds for OOF predictions
OOF_N_JOBS = -1          # n_jobs passed to cross_val_predict
SAVE_SUBMISSION = True   # set False if you don't want to save submission
MAX_MODELS_TO_BLEND = 3  # try blending top 2-3 models
# ------------------------------------------

def make_ohe():
    """Create a dense OneHotEncoder that ignores unseen categories.

    The keyword that requests dense output depends on the installed
    scikit-learn: ``sparse_output`` from version 1.2 on, ``sparse`` before.
    """
    from sklearn.preprocessing import OneHotEncoder

    installed = version.parse(sklearn.__version__)
    dense_kwarg = "sparse_output" if installed >= version.parse("1.2") else "sparse"
    return OneHotEncoder(handle_unknown="ignore", **{dense_kwarg: False})

def load_csv_safe(path):
    """Read a CSV into a DataFrame, turning boolean columns into strings.

    Columns that pandas parsed as ``bool`` are cast to ``str`` ("True" /
    "False") right after loading so that downstream imputers do not trip
    over the bool dtype. All other columns are returned as parsed.
    """
    frame = pd.read_csv(path)
    bool_columns = frame.select_dtypes(include=["bool"]).columns
    # One astype call with a per-column mapping; an empty mapping is a no-op.
    return frame.astype({column: str for column in bool_columns})

def build_preprocessor(X_df):
    """Derive an (unfitted) ColumnTransformer from the dtypes of ``X_df``.

    int64/float64 columns are median-imputed and standardised; object and
    category columns are filled with the constant "MISSING" and one-hot
    encoded. ``PassengerId`` is left out of both groups, and any column not
    selected by either group is dropped by the transformer.
    """
    excluded = ("PassengerId",)

    def _columns_of(dtypes):
        # Column names with one of the given dtypes, minus the excluded ids.
        names = X_df.select_dtypes(include=dtypes).columns.tolist()
        return [name for name in names if name not in excluded]

    numeric_features = _columns_of(["int64", "float64"])
    categorical_features = _columns_of(["object", "category"])

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
        ("ohe", make_ohe()),
    ]
    return ColumnTransformer(
        [
            ("num", Pipeline(numeric_steps), numeric_features),
            ("cat", Pipeline(categorical_steps), categorical_features),
        ],
        remainder="drop",
    )

def find_latest_randomsearch(prefix):
    """Return the last ``*{prefix}*randomsearch*.pkl`` file in MODELS_DIR, or None.

    "Last" means last in lexicographic order of the path string, so it is
    only the most recent file when the file names sort chronologically.
    """
    matches = glob.glob(str(MODELS_DIR / f"*{prefix}*randomsearch*.pkl"))
    if not matches:
        return None
    return max(matches)

def find_latest_stacking():
    """Return the lexicographically last saved stacking-pipeline pickle, or None.

    Looks in MODELS_DIR for files matching ``*stacking*pipeline*.pkl`` or
    ``*stack*pipeline*.pkl``.
    """
    found = []
    for pattern in ("*stacking*pipeline*.pkl", "*stack*pipeline*.pkl"):
        found.extend(glob.glob(str(MODELS_DIR / pattern)))
    return max(found) if found else None

def load_tuned_estimator(file_path):
    """Unpickle ``file_path`` and return the estimator it holds.

    If the loaded object exposes ``best_estimator_`` (as a fitted search
    object does), that attribute is returned; otherwise the object itself.
    Any failure is printed and reported to the caller as ``None``.
    """
    # NOTE: joblib.load unpickles arbitrary objects - only point this at
    # model files you created yourself.
    try:
        loaded = joblib.load(file_path)
        return getattr(loaded, "best_estimator_", loaded)
    except Exception as e:
        print("Failed to load tuned estimator:", file_path, e)
        return None

def prepare_candidate_pipelines(preprocessor, X_df):
    """Collect candidate pipelines from previously saved models in MODELS_DIR.

    Args:
        preprocessor: ColumnTransformer placed in front of every tuned
            estimator that gets loaded.
        X_df: Unused; kept so existing callers keep working.

    Returns:
        dict mapping candidate name -> Pipeline, in the order
        ``rf_tuned``, ``xgb_tuned``, ``lgb_tuned``, ``stacking_saved``.
        Only entries whose pickle was found and loaded are present; the dict
        may be empty, in which case the caller is expected to add defaults.
    """
    candidates = {}

    # (candidate name, file-name fragment, label for the log, backend usable?)
    tuned_specs = (
        ("rf_tuned", "random_forest", "RandomForest", True),
        ("xgb_tuned", "xgboost", "XGBoost", HAS_XGB),
        ("lgb_tuned", "lightgbm", "LightGBM", HAS_LGB),
    )
    for name, fragment, label, available in tuned_specs:
        if not available:
            continue
        pickle_path = find_latest_randomsearch(fragment)
        if not pickle_path:
            continue
        est = load_tuned_estimator(pickle_path)
        if est is None:
            # load_tuned_estimator already printed the reason.
            continue
        candidates[name] = Pipeline([("pre", preprocessor), ("clf", est)])
        print(f"Loaded tuned {label} from:", pickle_path)

    # A saved stacking pipeline is used as-is (it is not re-wrapped with the
    # preprocessor). Loading stays best-effort, but failures are reported
    # instead of being silently swallowed.
    stacking_file = find_latest_stacking()
    if stacking_file:
        try:
            stack_pipe = joblib.load(stacking_file)
        except Exception as e:
            print("Failed to load stacking pipeline:", stacking_file, e)
        else:
            if isinstance(stack_pipe, Pipeline):
                candidates["stacking_saved"] = stack_pipe
                print("Loaded saved stacking pipeline:", stacking_file)
            else:
                print(
                    "Ignoring stacking pickle (not a Pipeline):",
                    stacking_file,
                    type(stack_pipe).__name__,
                )

    return candidates

def train_quick_models(preprocessor, X, y, names_to_train=("rf_quick","gb_quick","xgb_quick","lgb_quick")):
    """Build quick default candidate pipelines with fast settings.

    Despite the name, nothing is fitted here: ``X`` and ``y`` are accepted
    but not used, and the returned pipelines are unfitted. XGBoost and
    LightGBM entries are only created when the library was importable.

    Returns a dict name -> Pipeline(preprocessor, classifier) holding the
    requested names in the order rf, gb, xgb, lgb.
    """
    estimators = {}
    if "rf_quick" in names_to_train:
        estimators["rf_quick"] = RandomForestClassifier(
            n_estimators=200, max_depth=8, random_state=RANDOM_STATE, n_jobs=-1
        )
    if "gb_quick" in names_to_train:
        estimators["gb_quick"] = GradientBoostingClassifier(
            n_estimators=200, learning_rate=0.05, max_depth=3, random_state=RANDOM_STATE
        )
    if HAS_XGB and "xgb_quick" in names_to_train:
        estimators["xgb_quick"] = XGBClassifier(
            n_estimators=200,
            learning_rate=0.05,
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=RANDOM_STATE,
            n_jobs=1,
        )
    if HAS_LGB and "lgb_quick" in names_to_train:
        estimators["lgb_quick"] = LGBMClassifier(
            n_estimators=200, learning_rate=0.05, random_state=RANDOM_STATE, n_jobs=1
        )

    # Every classifier sits behind the same preprocessing step.
    return {
        model_name: Pipeline([("pre", preprocessor), ("clf", clf)])
        for model_name, clf in estimators.items()
    }

def oof_probs_for_pipeline(pipe, X, y, folds=CV_FOLDS):
    """Compute out-of-fold probabilities for ``pipe`` over (X, y).

    Uses a shuffled StratifiedKFold seeded with RANDOM_STATE and
    ``cross_val_predict(method="predict_proba")``. Only the second
    probability column (index 1) is returned, one value per sample.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=RANDOM_STATE)
    class_probs = cross_val_predict(
        pipe, X, y, cv=splitter, method="predict_proba", n_jobs=OOF_N_JOBS
    )
    positive_column = 1
    return class_probs[:, positive_column]

def evaluate_oof_blend(oof_probs_list, y_true):
    """Score an equal-weight blend of several OOF probability arrays.

    The arrays (all of the same length) are averaged element-wise, the
    average is thresholded at 0.5, and ``(accuracy, f1)`` against ``y_true``
    is returned.
    """
    stacked = np.vstack(oof_probs_list)
    mean_probs = stacked.mean(axis=0)
    labels = (mean_probs >= 0.5).astype(int)
    acc = accuracy_score(y_true, labels)
    f1 = f1_score(y_true, labels)
    return acc, f1

# -------------------- MAIN --------------------
def _simplex_weight_grid(n_models, steps):
    """Yield every weight tuple of length ``n_models`` on a 1/``steps`` grid summing to 1.

    The grid is enumerated with integers and divided only at the end, so no
    valid combination is lost to floating-point round-off and the last weight
    can never come out slightly negative.
    """
    for head in itertools.product(range(steps + 1), repeat=n_models - 1):
        used = sum(head)
        if used > steps:
            continue
        yield tuple(k / steps for k in head) + ((steps - used) / steps,)


def _search_best_blend(oof_list, y_true):
    """Grid-search blend weights on OOF probabilities.

    Returns ``(best, results)``: ``best`` is a dict with keys ``weights``,
    ``accuracy`` and ``f1`` for the most accurate blend (the first one found
    wins ties); ``results`` holds one row dict per evaluated weight tuple.
    """
    n_models = len(oof_list)
    if n_models == 1:
        # Nothing to blend: just score the single model.
        preds = (oof_list[0] >= 0.5).astype(int)
        acc, f1 = accuracy_score(y_true, preds), f1_score(y_true, preds)
        best = {"weights": [1.0], "accuracy": acc, "f1": f1}
        return best, [{"w": "1.0", "accuracy": acc, "f1": f1}]

    # Step 0.05 for two models, a coarser 0.1 for three or more.
    steps = 20 if n_models == 2 else 10
    best = {"weights": None, "accuracy": -1, "f1": -1}
    results = []
    for weights in _simplex_weight_grid(n_models, steps):
        blended = sum(w * p for w, p in zip(weights, oof_list))
        preds = (blended >= 0.5).astype(int)
        acc = float(accuracy_score(y_true, preds))
        f1 = float(f1_score(y_true, preds))
        row = {f"w{i}": float(w) for i, w in enumerate(weights, start=1)}
        row.update(accuracy=acc, f1=f1)
        results.append(row)
        if acc > best["accuracy"]:
            best = {"weights": [float(w) for w in weights], "accuracy": acc, "f1": f1}
    return best, results


def main():
    """Run the blending workflow end to end.

    Loads the processed data, gathers candidate pipelines (saved ones first,
    quick defaults when fewer than two were found), computes OOF
    probabilities and CV accuracy per candidate, grid-searches blend weights
    for the top MAX_MODELS_TO_BLEND models, refits those on the full training
    data and, when enabled and a test CSV exists, writes a blended submission.
    Score tables, blend grid, fitted pipelines and a summary are written
    under OUT_DIR / MODELS_DIR / SUB_DIR with a timestamped prefix.

    Raises:
        SystemExit: if the training data has no 'Survived' column, or if no
            candidate could be cross-validated.
    """
    ts = datetime.now().strftime("%Y%m%d_%H%M")
    prefix = f"day22_{ts}"

    # 0) Load data (the test set is optional)
    train = load_csv_safe(TRAIN_CSV)
    test = load_csv_safe(TEST_CSV) if TEST_CSV.exists() else None

    if "Survived" not in train.columns:
        raise SystemExit("train_processed.csv must contain 'Survived' column")

    X_full = train.drop(columns=["Survived"])
    y_full = train["Survived"].astype(int)

    # 1) Build preprocessor using X_full as sample
    preprocessor = build_preprocessor(X_full)

    # 2) Prepare candidate pipelines (load tuned if possible)
    candidates = prepare_candidate_pipelines(preprocessor, X_full)
    print("Initial candidates loaded from models/: ", list(candidates.keys()))

    # 3) If too few candidates, add quick defaults
    if len(candidates) < 2:
        quick = train_quick_models(preprocessor, X_full, y_full)
        candidates.update(quick)
        print("Added quick default candidates: ", list(quick))

    # 4) For each candidate, compute OOF probs and a CV accuracy for ranking.
    #    Both calls fit clones internally, so no prior fit is needed.
    oof_dict = {}
    model_scores = []
    for name, pipe in candidates.items():
        print(f"\n-> Processing candidate: {name}")
        try:
            probs = oof_probs_for_pipeline(pipe, X_full, y_full, folds=CV_FOLDS)
            oof_dict[name] = probs
            cv_scores = cross_val_score(pipe, X_full, y_full, cv=CV_FOLDS, scoring="accuracy", n_jobs=-1)
            mean_cv = float(np.mean(cv_scores))
            std_cv = float(np.std(cv_scores))
            model_scores.append({"name": name, "cv_mean": mean_cv, "cv_std": std_cv})
            print(f"  {name} CV mean: {mean_cv:.4f} (+/- {std_cv:.4f})")
        except Exception as e:
            # Best effort: a failing candidate is reported and left out.
            print(f"  Failed to compute OOF for {name}: {e}")

    # Without this guard an empty score table fails later with an opaque
    # KeyError on the missing "cv_mean" column.
    if not model_scores:
        raise SystemExit("No candidate model could be cross-validated; nothing to blend")

    # Save model_scores
    df_scores = pd.DataFrame(model_scores).sort_values("cv_mean", ascending=False)
    out_scores = OUT_DIR / f"{prefix}_model_scores.csv"
    df_scores.to_csv(out_scores, index=False)
    print("\nSaved candidate CV scores to:", out_scores)

    # 5) Choose top models to blend (by cv_mean)
    top_names = df_scores["name"].tolist()[:MAX_MODELS_TO_BLEND]
    print("Top models selected for blending:", top_names)
    oof_list = [oof_dict[n] for n in top_names]

    # 6) Weighted blend search over the selected models
    print("\nSearching weight grid for best blend...")
    best, results = _search_best_blend(oof_list, y_full)

    # save results
    df_results = pd.DataFrame(results)
    out_grid = OUT_DIR / f"{prefix}_blend_results.csv"
    df_results.to_csv(out_grid, index=False)
    print("Saved blend grid results to:", out_grid)
    print("Best blend:", best)

    # 7) Train top pipelines on full data and produce blended test submission if requested
    trained_pipelines = {}
    for name in top_names:
        pipe = candidates[name]
        print("Fitting pipeline on full data:", name)
        pipe.fit(X_full, y_full)
        trained_pipelines[name] = pipe
    joblib.dump(trained_pipelines, MODELS_DIR / f"{prefix}_pipelines.pkl")
    print("Saved trained pipelines to models/")

    if SAVE_SUBMISSION and test is not None:
        print("Generating blended submission on test set...")
        # Test probabilities in the same order as best["weights"].
        test_probs = [trained_pipelines[name].predict_proba(test)[:, 1] for name in top_names]
        blended_test_probs = sum(w * p for w, p in zip(best["weights"], test_probs))
        # Same 0.5 threshold that was used during the weight search.
        test_pred_labels = (blended_test_probs >= 0.5).astype(int)
        # Fall back to 1..n ids when the test set carries no PassengerId.
        if "PassengerId" in test.columns:
            pid = test["PassengerId"]
        else:
            pid = np.arange(1, len(test) + 1)
        submission = pd.DataFrame({"PassengerId": pid, "Survived": test_pred_labels})
        out_sub = SUB_DIR / f"{prefix}_blend_submission.csv"
        submission.to_csv(out_sub, index=False)
        print("Saved blended submission to:", out_sub)

    # 8) Save summary
    summary = {
        "timestamp": ts,
        "top_models": top_names,
        "best_weights": best["weights"],
        "best_accuracy": best["accuracy"],
        "best_f1": best["f1"]
    }
    pd.Series(summary).to_frame("value").to_csv(OUT_DIR / f"{prefix}_summary.csv")
    print("Saved summary to outputs/")

    print("\nDONE — check outputs/, models/, submissions/")

if __name__ == "__main__":
    main()

Initial candidates loaded from models/:  []
Added quick default candidates:  ['rf_quick', 'gb_quick', 'xgb_quick', 'lgb_quick']

-> Processing candidate: rf_quick
  rf_quick CV mean: 0.8215 (+/- 0.0271)

-> Processing candidate: gb_quick
  gb_quick CV mean: 0.8294 (+/- 0.0193)

-> Processing candidate: xgb_quick
  xgb_quick CV mean: 0.8328 (+/- 0.0223)

-> Processing candidate: lgb_quick
  lgb_quick CV mean: 0.8305 (+/- 0.0207)

Saved candidate CV scores to: outputs/day22_20250929_1619_model_scores.csv
Top models selected for blending: ['xgb_quick', 'lgb_quick', 'gb_quick']

Searching weight grid for best blend...
Saved blend grid results to: outputs/day22_20250929_1619_blend_results.csv
Best blend: {'weights': [0.0, 0.0, 1.0], 'accuracy': 0.8439955106621774, 'f1': 0.785824345146379}
Fitting pipeline on full data: xgb_quick
Fitting pipeline on full data: lgb_quick
[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading