<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-19/day19_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# day19_titanic.py
# -------------------------------------------------------
# Titanic ML Challenge - Day 19 (Stacking + Explainability)
# -------------------------------------------------------

import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance


# ---------------------------
# 1) Paths
# ---------------------------
DATA_DIR = Path("data/processed")
MODEL_DIR = Path("models")
REPORT_DIR = Path("reports")

for p in [MODEL_DIR, REPORT_DIR]:
    p.mkdir(parents=True, exist_ok=True)

train_path = DATA_DIR / "train_processed.csv"
test_path = DATA_DIR / "test_processed.csv"


# ---------------------------
# 2) Load data
# ---------------------------
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# 🔑 Fix: convert all bools → str upfront
for df in [train, test]:
    for col in df.select_dtypes(include=["bool"]).columns:
        df[col] = df[col].astype(str)

X = train.drop(["Survived", "PassengerId"], axis=1)
y = train["Survived"]

X_test = test.drop(["PassengerId"], axis=1)
test_passenger_id = test["PassengerId"]


# ---------------------------
# 3) Preprocessor
# ---------------------------
def build_preprocessor(X_df):
    """Build preprocessing pipeline for numerical + categorical features."""

    numeric_features = X_df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_features = X_df.select_dtypes(include=["object", "category"]).columns.tolist()

    num_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_transformer, numeric_features),
            ("cat", cat_transformer, categorical_features),
        ]
    )

    return preprocessor


# ---------------------------
# 4) Models
# ---------------------------
rf = RandomForestClassifier(
    n_estimators=300, max_depth=7, random_state=42
)

gb = GradientBoostingClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42
)

base_models = [
    ("rf", rf),
    ("gb", gb),
]

stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    n_jobs=-1
)


# ---------------------------
# 5) Pipeline
# ---------------------------
def build_pipeline(X_df):
    preprocessor = build_preprocessor(X_df)
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("stack", stack_model)
    ])
    return pipeline


# ---------------------------
# 6) Permutation Importance
# ---------------------------
def permutation_importance_on_pipeline(pipeline, X_val, y_val, out_prefix="run"):
    result = permutation_importance(
        pipeline, X_val, y_val, n_repeats=10, random_state=42, n_jobs=-1
    )

    # Get feature names
    preprocessor = pipeline.named_steps["preprocessor"]
    feature_names = preprocessor.get_feature_names_out()

    # Create aligned DataFrame
    perm_df = pd.DataFrame({
        "feature": feature_names,
        "importance_mean": result.importances_mean,
        "importance_std": result.importances_std
    }).sort_values(by="importance_mean", ascending=False)

    out_path = REPORT_DIR / f"{out_prefix}_permutation_importance.csv"
    perm_df.to_csv(out_path, index=False)
    print(f"✅ Permutation importance saved to {out_path}")

    return perm_df


# ---------------------------
# 7) Main
# ---------------------------
def main():
    prefix = f"day19_stacking_{datetime.now().strftime('%Y%m%d_%H%M')}"

    # Split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Data shapes — train: {X_train.shape} val: {X_val.shape}")

    pipeline = build_pipeline(X)

    # Train
    print("Training stacking pipeline...")
    pipeline.fit(X_train, y_train)

    # Validation accuracy
    val_pred = pipeline.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print(f"Validation Accuracy: {val_acc:.4f}")

    # CV accuracy
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy", n_jobs=-1)
    print(f"CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

    # Save pipeline
    model_path = MODEL_DIR / f"{prefix}_pipeline.pkl"
    joblib.dump(pipeline, model_path)
    print(f"✅ Saved pipeline to: {model_path}")

    # Permutation importance
    permutation_importance_on_pipeline(pipeline, X_val, y_val, out_prefix=prefix)

    # Predict test set
    test_pred = pipeline.predict(X_test)
    submission = pd.DataFrame({
        "PassengerId": test_passenger_id,
        "Survived": test_pred
    })

    out_path = Path("submissions") / f"{prefix}.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(out_path, index=False)
    print(f"✅ Submission saved to {out_path}")


if __name__ == "__main__":
    main()


Data shapes — train: (712, 28) val: (179, 28)
Training stacking pipeline...
Validation Accuracy: 0.8212
CV Accuracy: 0.8316 (+/- 0.0189)
✅ Saved pipeline to: models/day19_stacking_20250926_1555_pipeline.pkl


ValueError: All arrays must be of the same length