<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-25/day25_ensembling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# day25_ensembling.py
"""
Day 25 - Ensembling (Stacking / Blending) using OOF predictions
Inputs:
    - train_processed.csv (must contain 'Survived' target)
    - test_processed.csv  (PassengerId/Id column is optional; a 0-based 'Id' is generated when absent)
Outputs:
    - day25_submission.csv
    - day25_models.joblib
    - day25_report.json
"""

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import joblib, json
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.base import clone

# Seed passed to the train/holdout split, the CV folds and every estimator.
RANDOM_STATE = 42
# Input CSVs read by main().
TRAIN_FILE = "train_processed.csv"
TEST_FILE = "test_processed.csv"
# Label column that must be present in TRAIN_FILE.
TARGET = "Survived"
# Output artefacts written by main(): JSON metrics, joblib bundle, CSV predictions.
REPORT_FILE = "day25_report.json"
MODELS_FILE = "day25_models.joblib"
SUBMISSION_FILE = "day25_submission.csv"

def find_id_column(df):
    """Return the first known identifier column name found in ``df``.

    Candidates are tried in priority order ("PassengerId", "Id", "ID",
    "passengerid"); matching is exact and case-sensitive.

    Returns:
        The matching column name, or ``None`` when no candidate is present.
    """
    candidates = ("PassengerId", "Id", "ID", "passengerid")
    present = (name for name in candidates if name in df.columns)
    return next(present, None)

def build_preprocessor(X):
    """Build an unfitted ColumnTransformer matched to the dtypes of ``X``.

    Columns are routed by dtype:
        * numeric      -> median imputation followed by standard scaling
        * object/category -> most-frequent imputation followed by one-hot encoding
        * bool         -> one-hot encoding

    Columns of any other dtype are dropped by the transformer
    (``remainder="drop"``).

    Args:
        X: pandas DataFrame whose dtypes drive the column routing.

    Returns:
        A tuple ``(preprocessor, numeric_cols, non_numeric_cols)`` where
        ``non_numeric_cols`` is the categorical column names followed by the
        boolean column names.
    """
    # "number" matches every numeric dtype (int32, float32, uint8, ...), not
    # only int64/float64; otherwise such columns would fall through to
    # remainder="drop" and silently vanish from the feature matrix.
    # pandas excludes bool columns from "number", so they are not duplicated.
    numeric_cols = X.select_dtypes(include="number").columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    boolean_cols = X.select_dtypes(include=["bool"]).columns.tolist()

    # numeric pipeline
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    transformers = [("num", num_pipe, numeric_cols)]

    # categorical pipeline (only if categories exist)
    if categorical_cols:
        cat_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # handle_unknown="ignore": categories unseen at fit time encode as all zeros
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])
        transformers.append(("cat", cat_pipe, categorical_cols))

    # boolean pipeline (only if booleans exist)
    if boolean_cols:
        bool_pipe = Pipeline([
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])
        transformers.append(("bool", bool_pipe, boolean_cols))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder="drop"
    )

    return preprocessor, numeric_cols, categorical_cols + boolean_cols


def _take_rows(data, idx):
    """Select rows of ``data`` by integer position (pandas objects via ``.iloc``, otherwise plain indexing)."""
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def get_oof_predictions(base_models, X, y, X_test, n_splits=5):
    """
    For each base model, produce out-of-fold train predictions and averaged test predictions.

    Every model is cloned and refitted once per stratified fold; the
    probability of the second class (``predict_proba(...)[:, 1]``) is used as
    the meta-feature, so the target is expected to be binary.

    Args:
        base_models: sequence of ``(name, estimator)`` pairs; estimators must
            support ``predict_proba``.
        X: training features with a ``shape`` attribute (array, sparse matrix
            or DataFrame).
        y: training labels (pandas Series, NumPy array or list).
        X_test: test features accepted by the estimators' ``predict_proba``.
        n_splits: number of stratified folds.

    Returns:
        meta_train: (n_samples, n_models)
        meta_test:  (n_test, n_models)
    """
    n_models = len(base_models)
    n_samples = X.shape[0]
    n_test = X_test.shape[0]
    meta_train = np.zeros((n_samples, n_models))
    meta_test = np.zeros((n_test, n_models))

    # Positional view of the labels: works for Series (regardless of index),
    # ndarray and list alike.
    y_arr = np.asarray(y)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    # The seeded splitter yields the same folds on every call, so compute
    # them once and reuse them for all base models.
    folds = list(kf.split(X, y_arr))

    for i, (name, model) in enumerate(base_models):
        print(f"\nGenerating OOF for base model: {name}")
        oof_train = np.zeros(n_samples)
        test_fold_preds = np.zeros((n_test, n_splits))

        for fold, (tr_idx, val_idx) in enumerate(folds):
            print(f"  Fold {fold+1}/{n_splits} ...")
            # Fresh clone per fold so no state leaks between folds.
            m = clone(model)
            m.fit(_take_rows(X, tr_idx), y_arr[tr_idx])
            # predict_proba for binary class -> prob of class 1
            oof_train[val_idx] = m.predict_proba(_take_rows(X, val_idx))[:, 1]
            test_fold_preds[:, fold] = m.predict_proba(X_test)[:, 1]

        meta_train[:, i] = oof_train
        # Test meta-feature = mean of the per-fold model predictions.
        meta_test[:, i] = test_fold_preds.mean(axis=1)

    return meta_train, meta_test

def main():
    """Train, evaluate and export a stacked ensemble for the TARGET column.

    Steps:
        1.   Load TRAIN_FILE / TEST_FILE and separate features, target and ids.
        2.   Hold out 20% of the training rows (stratified) for evaluation.
        3.   Fit the preprocessor on the remaining 80% only.
        4-6. Build out-of-fold meta-features from the base models and fit a
             logistic-regression meta model on them.
        7.   Score the stack on the holdout rows.
        8.   Redo preprocessing and OOF stacking on all training rows and
             predict the test set.
        9.   Save preprocessor, base models and meta model to MODELS_FILE.
        10.  Write SUBMISSION_FILE and REPORT_FILE.

    Raises:
        ValueError: If TARGET is not a column of TRAIN_FILE.
    """
    # ---------------------------
    # Step 1: Load
    # ---------------------------
    train = pd.read_csv(TRAIN_FILE)
    test = pd.read_csv(TEST_FILE)
    id_col = find_id_column(test)

    if TARGET not in train.columns:
        raise ValueError(f"Target column '{TARGET}' not found in {TRAIN_FILE}")

    # The id column is looked up in the test file; drop it from train only if
    # train actually carries the same column.
    drop_cols = [TARGET]
    if id_col and id_col in train.columns:
        drop_cols.append(id_col)

    X = train.drop(columns=drop_cols)
    y = train[TARGET].copy()

    if id_col:
        test_ids = test[id_col].copy()
        X_test_full = test.drop(columns=[id_col])
    else:
        # No identifier in the test file: fall back to a 0-based running id.
        test_ids = pd.Series(np.arange(len(test)), name="Id")
        X_test_full = test.copy()

    print("Train shape:", X.shape, "Test shape:", X_test_full.shape)

    # ---------------------------
    # Step 2: Holdout split for honest evaluation
    # ---------------------------
    X_train_full, X_hold, y_train_full, y_hold = train_test_split(
        X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
    )
    print("X_train_full:", X_train_full.shape, "X_hold:", X_hold.shape)

    # ---------------------------
    # Step 3: Preprocessing (fit on X_train_full only)
    # ---------------------------
    preprocessor, numeric_cols, categorical_cols = build_preprocessor(X_train_full)
    print("Numeric cols:", numeric_cols)
    print("Categorical cols:", categorical_cols)

    X_train_proc = preprocessor.fit_transform(X_train_full)
    X_hold_proc = preprocessor.transform(X_hold)
    X_test_proc = preprocessor.transform(X_test_full)

    # ---------------------------
    # Step 4: Base models definition
    # ---------------------------
    base_models = [
        ("rf", RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE)),
        ("et", ExtraTreesClassifier(n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE)),
        ("gb", GradientBoostingClassifier(n_estimators=200, random_state=RANDOM_STATE))
    ]

    # ---------------------------
    # Step 5: OOF on X_train_full (to train meta model)
    # ---------------------------
    # Only the train-side meta-features are needed here; the test-side ones
    # are rebuilt in Step 8 from models that have seen all training rows.
    meta_train, _ = get_oof_predictions(base_models, X_train_proc, y_train_full, X_test_proc, n_splits=5)

    # ---------------------------
    # Step 6: Train meta model
    # ---------------------------
    meta_clf = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
    meta_clf.fit(meta_train, y_train_full)
    print("\nMeta model trained on OOF meta-features.")

    # ---------------------------
    # Step 7: Evaluate on holdout
    #    Build holdout meta features by training base models on full X_train_full and predicting on X_hold
    # ---------------------------
    print("\nBuilding holdout meta-features...")
    meta_hold = np.zeros((X_hold_proc.shape[0], len(base_models)))
    for i, (name, model) in enumerate(base_models):
        m = clone(model)
        m.fit(X_train_proc, y_train_full)   # train on full 'train' part
        meta_hold[:, i] = m.predict_proba(X_hold_proc)[:, 1]

    y_hold_pred = meta_clf.predict(meta_hold)
    # LogisticRegression always provides predict_proba, so no fallback is
    # needed; a failure here is a real error and should surface.
    y_hold_proba = meta_clf.predict_proba(meta_hold)[:, 1]

    acc_hold = accuracy_score(y_hold, y_hold_pred)
    roc_hold = roc_auc_score(y_hold, y_hold_proba)
    cr_hold = classification_report(y_hold, y_hold_pred)
    cm_hold = confusion_matrix(y_hold, y_hold_pred)

    print("\nHoldout evaluation:")
    print(f"Accuracy: {acc_hold:.4f}")
    print(f"ROC AUC: {roc_hold:.4f}")
    print("\nClassification report:\n", cr_hold)
    print("\nConfusion matrix:\n", cm_hold)

    # ---------------------------
    # Step 8: Final re-fit on full training set for submission
    #    - Refit preprocessor on FULL training X (X_train_full + X_hold)
    #    - Generate OOF meta-features on FULL training & test
    #    - Train final meta on those OOF features
    # ---------------------------
    print("\nRefitting preprocessor on FULL training data and building final meta-features...")
    X_full = pd.concat([X_train_full, X_hold], axis=0).reset_index(drop=True)
    y_full = pd.concat([y_train_full, y_hold], axis=0).reset_index(drop=True)

    preprocessor_full, _, _ = build_preprocessor(X_full)
    X_full_proc = preprocessor_full.fit_transform(X_full)
    X_test_proc_full = preprocessor_full.transform(X_test_full)

    meta_train_full, meta_test_final = get_oof_predictions(base_models, X_full_proc, y_full, X_test_proc_full, n_splits=5)

    final_meta_clf = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
    final_meta_clf.fit(meta_train_full, y_full)

    final_test_probs = final_meta_clf.predict_proba(meta_test_final)[:, 1]
    # Hard labels for the submission: positive class at probability >= 0.5.
    final_test_preds = (final_test_probs >= 0.5).astype(int)

    # ---------------------------
    # Step 9: Save models (base models trained on FULL), meta model & preprocessor
    # ---------------------------
    print("\nRetraining base models on FULL training data for saving...")
    fitted_base_models = {}
    for name, model in base_models:
        m = clone(model)
        m.fit(X_full_proc, y_full)
        fitted_base_models[name] = m

    saved_objects = {
        "preprocessor": preprocessor_full,
        "base_models": fitted_base_models,
        "meta_model": final_meta_clf,
        "meta_model_trained_on": "OOF on full training"
    }
    joblib.dump(saved_objects, MODELS_FILE)
    print(f"Saved models & preprocessor to: {MODELS_FILE}")

    # ---------------------------
    # Step 10: Save submission & report
    # ---------------------------
    submission = pd.DataFrame({
        id_col or "Id": test_ids,
        TARGET: final_test_preds
    })
    submission.to_csv(SUBMISSION_FILE, index=False)
    print(f"Saved submission to: {SUBMISSION_FILE}")

    report = {
        "holdout_metrics": {
            "accuracy": float(acc_hold),
            "roc_auc": float(roc_hold),
            "classification_report": cr_hold,
            "confusion_matrix": cm_hold.tolist()
        },
        "models": [name for name, _ in base_models],
        "meta_model": "LogisticRegression",
        "notes": "Base models: RandomForest, ExtraTrees, GradientBoosting. Meta model trained on OOF meta-features (5-fold). Final meta trained on full OOF meta features."
    }
    # Explicit encoding so the report does not depend on the platform default.
    with open(REPORT_FILE, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"Saved report to: {REPORT_FILE}")

    print("\nDone ✅")

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Train shape: (891, 28) Test shape: (418, 28)
X_train_full: (712, 28) X_hold: (179, 28)
Numeric cols: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'Fare_log', 'TicketGroupSize']
Categorical cols: ['Sex_male', 'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_U', 'AgeBin_Child', 'AgeBin_MidAge', 'AgeBin_Senior', 'AgeBin_Teen']

Generating OOF for base model: rf
  Fold 1/5 ...
  Fold 2/5 ...
  Fold 3/5 ...
  Fold 4/5 ...
  Fold 5/5 ...

Generating OOF for base model: et
  Fold 1/5 ...
  Fold 2/5 ...
  Fold 3/5 ...
  Fold 4/5 ...
  Fold 5/5 ...

Generating OOF for base model: gb
  Fold 1/5 ...
  Fold 2/5 ...
  Fold 3/5 ...
  Fold 4/5 ...
  Fold 5/5 ...

Meta model trained on OOF meta-features.

Building holdout meta-features...

Holdout evaluation:
Accuracy: 0.8101
ROC AUC: 0.8264

Classification report:
               precision    recall  f1-score   support

 