<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-20/day20_shap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# day20_shap.py
# Day 20 — SHAP explainability, permutation importance fallback, threshold sweep
# Save as day20_shap.py and run: python day20_shap.py

import os
import glob
import joblib
import warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from packaging import version
import sklearn

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, brier_score_loss
from sklearn.inspection import permutation_importance
from sklearn.calibration import CalibratedClassifierCV

warnings.filterwarnings("ignore")

# ---------------- CONFIG ----------------
DATA_DIR = Path("data/processed")
TRAIN_CSV = DATA_DIR / "train_processed.csv"
TEST_CSV = DATA_DIR / "test_processed.csv"

MODELS_DIR = Path("models")
OUT_DIR = Path("outputs")
MODELS_DIR.mkdir(exist_ok=True)
OUT_DIR.mkdir(exist_ok=True)

RANDOM_STATE = 42
HOLDOUT_SIZE = 0.20
CV_FOLDS = 5
# ---------------------------------------

# safe OHE factory for sklearn versions
def make_ohe():
    from sklearn.preprocessing import OneHotEncoder
    if version.parse(sklearn.__version__) >= version.parse("1.2"):
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    else:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

def find_latest_pipeline():
    files = sorted(glob.glob(str(MODELS_DIR / "*stack*.pkl")))
    if not files:
        return None
    return files[-1]

def get_preprocessor_from_pipeline(pipeline):
    for name in ("preprocessor", "pre", "preprocessing", "prep"):
        if name in pipeline.named_steps:
            return pipeline.named_steps[name]
    # fallback: search for a ColumnTransformer in named_steps
    for step in pipeline.named_steps.values():
        if isinstance(step, ColumnTransformer):
            return step
    return None

def get_stack_from_pipeline(pipeline):
    for name in ("stack", "stacking", "stacker", "stackingclassifier", "stacking_clf"):
        if name in pipeline.named_steps:
            return pipeline.named_steps[name]
    # fallback search for StackingClassifier instance
    for step in pipeline.named_steps.values():
        if hasattr(step, "estimators_") or step.__class__.__name__.lower().startswith("stack"):
            return step
    return None

def build_quick_pipeline(X_df):
    # ensure bools converted to strings (avoid imputer dtype issues)
    X_copy = X_df.copy()
    for col in X_copy.select_dtypes(include=["bool"]).columns:
        X_copy[col] = X_copy[col].astype(str)

    numeric_features = X_copy.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_features = X_copy.select_dtypes(include=["object", "category"]).columns.tolist()

    # remove passenger id if present
    for c in ("PassengerId",):
        if c in numeric_features:
            numeric_features.remove(c)
        if c in categorical_features:
            categorical_features.remove(c)

    num_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
        ("ohe", make_ohe())
    ])
    preprocessor = ColumnTransformer([
        ("num", num_transformer, numeric_features),
        ("cat", cat_transformer, categorical_features)
    ], remainder="drop")

    rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
    gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=RANDOM_STATE)

    stack = StackingClassifier(
        estimators=[("rf", rf), ("gb", gb)],
        final_estimator=LogisticRegression(max_iter=2000),
        passthrough=True,
        cv=5, n_jobs=-1
    )

    pipeline = Pipeline([
        ("pre", preprocessor),
        ("stack", stack)
    ])
    return pipeline

def safe_get_feature_names(preprocessor, X_df):
    # Try several ways to retrieve feature names after ColumnTransformer
    try:
        # preferred: pass input feature names
        return list(preprocessor.get_feature_names_out(X_df.columns))
    except Exception:
        try:
            return list(preprocessor.get_feature_names_out())
        except Exception:
            # fallback: return generic f0..fN
            # we can estimate length by transforming one row
            try:
                arr = preprocessor.transform(X_df.iloc[:1])
                return [f"f{i}" for i in range(arr.shape[1])]
            except Exception:
                return []

def try_shap_on_rf(rf_model, preprocessor, X_val, feature_names, out_prefix):
    """Attempt SHAP TreeExplainer on RF model. Save CSV + plot. Return True on success."""
    try:
        import shap
    except Exception as e:
        print("shap not installed or import failed:", e)
        return False

    try:
        # Transform validation data
        X_val_pre = preprocessor.transform(X_val)
        # Ensure numpy float
        X_val_pre = np.asarray(X_val_pre, dtype=float)

        # TreeExplainer accepts the tree model and raw numpy
        explainer = shap.TreeExplainer(rf_model)
        shap_values = explainer.shap_values(X_val_pre)  # list for classes or array

        # For binary classifier shap_values is list-like: take class 1
        if isinstance(shap_values, list) and len(shap_values) > 1:
            shap_vals = shap_values[1]
        else:
            shap_vals = shap_values

        # mean abs shap
        mean_abs = np.abs(shap_vals).mean(axis=0)
        df = pd.DataFrame({"feature": feature_names, "mean_abs_shap": mean_abs})
        df = df.sort_values("mean_abs_shap", ascending=False)
        out_csv = OUT_DIR / f"{out_prefix}_shap_importance.csv"
        df.to_csv(out_csv, index=False)
        print("Saved SHAP importances to:", out_csv)

        # produce a summary plot and save
        plt.figure(figsize=(8, 8))
        shap.summary_plot(shap_vals, features=X_val_pre, feature_names=feature_names, show=False)
        plt.tight_layout()
        out_png = OUT_DIR / f"{out_prefix}_shap_summary.png"
        plt.savefig(out_png, dpi=200)
        plt.close()
        print("Saved SHAP summary plot to:", out_png)
        return True
    except Exception as e:
        print("SHAP TreeExplainer failed:", e)
        return False

def permutation_importance_and_save(pipeline, X_val, y_val, preprocessor, X_df, out_prefix):
    print("Computing permutation importance (model-agnostic)...")
    r = permutation_importance(pipeline, X_val, y_val, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1)
    # feature names
    feat_names = safe_get_feature_names(preprocessor, X_df)
    # align length
    L = len(r.importances_mean)
    if len(feat_names) != L:
        # fallback name generation
        feat_names = [f"f{i}" for i in range(L)]
    df = pd.DataFrame({
        "feature": feat_names,
        "importance_mean": r.importances_mean,
        "importance_std": r.importances_std
    }).sort_values("importance_mean", ascending=False)
    out_csv = OUT_DIR / f"{out_prefix}_permutation_importance.csv"
    df.to_csv(out_csv, index=False)
    print("Saved permutation importance to:", out_csv)
    return df

def threshold_sweep(pipeline, X_val, y_val, out_prefix):
    print("Running threshold sweep on validation set...")
    # get probabilities
    if hasattr(pipeline, "predict_proba"):
        probs = pipeline.predict_proba(X_val)[:, 1]
    else:
        # try accessing inner stack
        probs = pipeline.predict_proba(X_val)[:, 1]

    thresholds = np.linspace(0.05, 0.95, 19)
    recs = []
    best = {"threshold": None, "accuracy": -1, "f1": -1}
    for t in thresholds:
        preds = (probs >= t).astype(int)
        acc = accuracy_score(y_val, preds)
        f1 = f1_score(y_val, preds)
        recs.append({"threshold": float(t), "accuracy": float(acc), "f1": float(f1)})
        if acc > best["accuracy"]:
            best = {"threshold": float(t), "accuracy": float(acc), "f1": float(f1)}
    df = pd.DataFrame(recs)
    out_csv = OUT_DIR / f"{out_prefix}_thresholds.csv"
    df.to_csv(out_csv, index=False)
    print("Saved threshold sweep to:", out_csv)
    print("Best threshold (by accuracy):", best)
    return best, df

def calibrate_pipeline_and_eval(pipeline, X_train, y_train, X_val, y_val, out_prefix):
    print("Calibrating pipeline (CalibratedClassifierCV with sigmoid)...")
    try:
        cal = CalibratedClassifierCV(base_estimator=pipeline, method="sigmoid", cv=3)
        cal.fit(X_train, y_train)
        probs = cal.predict_proba(X_val)[:, 1]
        preds = (probs >= 0.5).astype(int)
        acc = accuracy_score(y_val, preds)
        brier = brier_score_loss(y_val, probs)
        metrics = {"calibrated_accuracy": float(acc), "brier_score": float(brier)}
        out_csv = OUT_DIR / f"{out_prefix}_calibration_metrics.csv"
        pd.Series(metrics).to_frame("value").to_csv(out_csv)
        joblib.dump(cal, MODELS_DIR / f"{out_prefix}_calibrated.pkl")
        print("Saved calibration metrics and calibrated model.")
        return metrics
    except Exception as e:
        print("Calibration failed:", e)
        return None

# ========== MAIN ==========
def main():
    ts = datetime.now().strftime("%Y%m%d_%H%M")
    prefix = f"day20_{ts}"

    # load data
    train = pd.read_csv(TRAIN_CSV)
    test = pd.read_csv(TEST_CSV)

    # safety: convert bools -> str early
    for df in (train, test):
        for c in df.select_dtypes(include=["bool"]).columns:
            df[c] = df[c].astype(str)

    if "Survived" not in train.columns:
        raise SystemExit("train_processed.csv must contain 'Survived' column")

    X_full = train.drop(columns=["Survived"])
    y_full = train["Survived"].astype(int)

    # holdout split
    X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=HOLDOUT_SIZE, stratify=y_full, random_state=RANDOM_STATE)
    print("Data shapes — train:", X_train.shape, "val:", X_val.shape)

    # 1) Load or train pipeline
    latest = find_latest_pipeline()
    if latest:
        print("Loading pipeline:", latest)
        pipeline = joblib.load(latest)
        pipeline_source = latest
    else:
        print("No saved pipeline found. Training a quick stacking pipeline...")
        pipeline = build_quick_pipeline(X_train)
        pipeline.fit(X_train, y_train)
        save_path = MODELS_DIR / f"{prefix}_stacking_pipeline.pkl"
        joblib.dump(pipeline, save_path)
        pipeline_source = str(save_path)
        print("Saved new pipeline to:", save_path)

    # get preprocessor and stack
    pre = get_preprocessor_from_pipeline(pipeline)
    stack = get_stack_from_pipeline(pipeline)

    if pre is None:
        raise SystemExit("Could not find preprocessor inside pipeline (expected ColumnTransformer).")

    # feature names
    feat_names = safe_get_feature_names(pre, X_train)
    if len(feat_names) == 0:
        print("Warning: couldn't determine feature names. Some outputs may use generic names.")

    # 2) Try SHAP on RF inside stack
    rf_model = None
    try:
        if hasattr(stack, "named_estimators_"):
            rf_model = stack.named_estimators_.get("rf", None)
        # fallback: if estimators_ attribute
        if rf_model is None and hasattr(stack, "estimators_"):
            # estimators_ is list of fitted models
            for name, est in stack.estimators_:
                if name == "rf":
                    rf_model = est
                    break
    except Exception:
        rf_model = None

    shap_success = False
    if rf_model is not None:
        print("Trying SHAP on RandomForest base model...")
        try:
            shap_success = try_shap_on_rf(rf_model, pre, X_val, feat_names, out_prefix=prefix)
        except Exception as e:
            print("SHAP attempt raised an error:", e)
            shap_success = False
    else:
        print("No RF found inside stacked model — skipping SHAP attempt.")

    # 3) If SHAP failed or unavailable, compute permutation importance
    if not shap_success:
        print("SHAP not used or failed — computing permutation importance instead.")
        perm_df = permutation_importance_and_save(pipeline, X_val, y_val, pre, X_val, out_prefix=prefix)
    else:
        perm_df = None

    # 4) Threshold sweep and save
    best_threshold, thresholds_df = threshold_sweep(pipeline, X_val, y_val, out_prefix=prefix)

    # 5) Optional calibration (quick)
    calibration = calibrate_pipeline_and_eval(pipeline, X_train, y_train, X_val, y_val, out_prefix=prefix)

    # 6) Save summary
    summary = {
        "timestamp": ts,
        "pipeline_source": pipeline_source,
        "shap_used": bool(shap_success),
        "best_threshold": best_threshold,
        "calibration": calibration
    }
    pd.Series(summary).to_frame("value").to_csv(OUT_DIR / f"{prefix}_summary.csv")
    print("Saved summary to:", OUT_DIR / f"{prefix}_summary.csv")

    print("\nDONE — check outputs/ for CSVs and plots, models/ for pipeline (if created).")

if __name__ == "__main__":
    main()


Data shapes — train: (712, 29) val: (179, 29)
No saved pipeline found. Training a quick stacking pipeline...
Saved new pipeline to: models/day20_20250927_1501_stacking_pipeline.pkl
Trying SHAP on RandomForest base model...
SHAP TreeExplainer failed: Per-column arrays must each be 1-dimensional
SHAP not used or failed — computing permutation importance instead.
Computing permutation importance (model-agnostic)...
Saved permutation importance to: outputs/day20_20250927_1501_permutation_importance.csv
Running threshold sweep on validation set...
Saved threshold sweep to: outputs/day20_20250927_1501_thresholds.csv
Best threshold (by accuracy): {'threshold': 0.49999999999999994, 'accuracy': 0.8491620111731844, 'f1': 0.7969924812030075}
Calibrating pipeline (CalibratedClassifierCV with sigmoid)...
Calibration failed: CalibratedClassifierCV.__init__() got an unexpected keyword argument 'base_estimator'
Saved summary to: outputs/day20_20250927_1501_summary.csv

DONE — check outputs/ for CSVs a