<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-28/day28_capstone_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# day28_stacking_final.py
"""
Day 28 - Stacking ensemble, advanced threshold tuning, final submission.
Inputs:
  - train_processed.csv  (must contain TARGET column, e.g. 'Survived')
  - test_processed.csv   (must contain PassengerId/Id)
Outputs:
  - day28_submission.csv
  - day28_models.joblib
  - day28_report.json
"""

import os
import json
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression

# ---------------------------
# Config / Filenames
# ---------------------------
# Input CSVs; main() raises FileNotFoundError unless both exist in the working directory.
TRAIN_FILE = "train_processed.csv"
TEST_FILE = "test_processed.csv"
# Label column; it must be present in the training file and is dropped from the features.
TARGET = "Survived"
# Possible identifier column names, checked in this order by find_id_col().
ID_CANDIDATES = ["PassengerId", "Id", "ID", "passengerid"]

# Artifacts written by main(): the fitted objects, the submission CSV, and the JSON report.
OUTPUT_MODELS = "day28_models.joblib"
OUTPUT_SUB = "day28_submission.csv"
OUTPUT_REPORT = "day28_report.json"

# Seed shared by the train/holdout split and the three base models.
RANDOM_STATE = 42

# ---------------------------
# Helpers
# ---------------------------
def find_id_col(df):
    """Return the first ID_CANDIDATES entry that is a column of ``df``.

    Candidates are tried in list order; ``None`` is returned when none of
    them is present.
    """
    return next((name for name in ID_CANDIDATES if name in df.columns), None)

def version_safe_ohe():
    """Build a dense-output OneHotEncoder that ignores unknown categories.

    The keyword that requests dense output is ``sparse_output`` when the
    installed scikit-learn is 1.2 or newer and ``sparse`` otherwise, so the
    keyword name is picked from the installed version.
    """
    from packaging import version
    import sklearn

    installed = version.parse(sklearn.__version__)
    dense_kwarg = "sparse_output" if installed >= version.parse("1.2") else "sparse"
    return OneHotEncoder(handle_unknown="ignore", **{dense_kwarg: False})

def build_preprocessor(X_df):
    """Create an unfitted ColumnTransformer matched to the dtypes of ``X_df``.

    int64/float64 columns are median-imputed and standardized. object/category
    columns, when there are any, are imputed with the most frequent value and
    one-hot encoded. Columns of any other dtype are dropped by the transformer.

    Returns:
        Tuple ``(preprocessor, numeric_cols, categorical_cols)``; the last two
        are the lists of column names routed to each branch.
    """
    numeric_frame = X_df.select_dtypes(include=["int64","float64"])
    categorical_frame = X_df.select_dtypes(include=["object","category"])
    numeric_cols = numeric_frame.columns.tolist()
    categorical_cols = categorical_frame.columns.tolist()

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
    branches = [("num", Pipeline(numeric_steps), numeric_cols)]

    # The categorical branch is only registered when such columns exist.
    if len(categorical_cols) > 0:
        categorical_steps = [
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", version_safe_ohe()),
        ]
        branches.append(("cat", Pipeline(categorical_steps), categorical_cols))

    return (
        ColumnTransformer(transformers=branches, remainder="drop"),
        numeric_cols,
        categorical_cols,
    )

def apply_rule_to_preds(preds, X_df, rule):
    """Return a copy of ``preds`` with one override rule applied.

    ``rule`` is ``None`` (no change) or a dict with a ``"type"`` key:

    * ``"numeric"`` with ``"feature"``, ``"threshold"`` and ``"op"``:
      ``">="`` turns 0-predictions into 1 where the feature is at or above
      the threshold; ``"<="`` turns 1-predictions into 0 where the feature is
      at or below it.
    * ``"categorical"`` with ``"feature"``, ``"value"`` and ``"op"``:
      ``"=="`` turns 0-predictions into 1 where the feature equals the value.

    Any other type/op combination leaves the predictions untouched. The
    input ``preds`` is never modified.
    """
    adjusted = preds.copy()
    if rule is None:
        return adjusted

    kind = rule["type"]
    if kind == "numeric":
        column = rule["feature"]
        cutoff = float(rule["threshold"])
        if rule["op"] == ">=":
            # Promote negatives whose feature value reaches the cutoff.
            promote = (X_df[column].values >= cutoff) & (adjusted == 0)
            adjusted[promote] = 1
            return adjusted
        if rule["op"] == "<=":
            # Demote positives whose feature value is at or under the cutoff.
            demote = (X_df[column].values <= cutoff) & (adjusted == 1)
            adjusted[demote] = 0
        return adjusted

    if kind == "categorical":
        column = rule["feature"]
        wanted = rule["value"]
        if rule["op"] == "==":
            # Promote negatives whose category matches the rule value.
            promote = (X_df[column].values == wanted) & (adjusted == 0)
            adjusted[promote] = 1
    return adjusted

# ---------------------------
# Main
# ---------------------------
def main():
    """Train the stacking ensemble, tune its threshold, and write all outputs.

    Steps:
      1. Load TRAIN_FILE / TEST_FILE and separate features, target and ids.
      2. Hold out a stratified 20% of the training rows.
      3. Fit the preprocessor on the remaining 80% and transform all splits.
      4-5. Fit a RandomForest / GradientBoosting / MLP stack with a
         LogisticRegression meta-learner.
      6-7. Score the holdout at threshold 0.5, then scan thresholds in
         [0.1, 0.9] and keep the one with the highest holdout F1.
      8. If day27_models.joblib exists and contains "selected_rule", apply
         that rule on top of the thresholded predictions.
      9-10. Predict the test set and write OUTPUT_SUB, OUTPUT_MODELS and
         OUTPUT_REPORT.

    The report's "holdout_f1" / "holdout_acc" describe the final holdout
    predictions (tuned threshold plus rule, if any); "baseline_f1" /
    "baseline_acc" describe the plain 0.5 threshold. Because the threshold is
    chosen on the same holdout it is scored on, the tuned figures are
    optimistic.

    Raises:
        FileNotFoundError: if either input CSV is missing.
        ValueError: if TARGET is not a column of the training file.
    """
    # 1) Load data
    if not os.path.exists(TRAIN_FILE) or not os.path.exists(TEST_FILE):
        raise FileNotFoundError("Make sure train_processed.csv and test_processed.csv are in the working directory.")

    train = pd.read_csv(TRAIN_FILE)
    test = pd.read_csv(TEST_FILE)
    id_col = find_id_col(test)

    if TARGET not in train.columns:
        raise ValueError(f"Target column '{TARGET}' not found in train file.")

    # The id column is an identifier, not a feature, so it is removed from
    # both feature frames when the test file has one.
    drop_cols = [TARGET]
    if id_col and id_col in train.columns:
        drop_cols.append(id_col)

    X = train.drop(columns=drop_cols, errors="ignore")
    y = train[TARGET].copy()
    X_test = test.drop(columns=[id_col], errors="ignore") if id_col else test.copy()
    # Without an id column, fall back to a 0-based row counter named "Id".
    test_ids = test[id_col] if id_col else pd.Series(np.arange(len(test)), name="Id")

    # 2) Split train/holdout
    X_train, X_hold, y_train, y_hold = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    # 3) Preprocessing (fit on the training split only, then reuse)
    preprocessor, _, _ = build_preprocessor(X_train)
    X_train_proc = preprocessor.fit_transform(X_train)
    X_hold_proc = preprocessor.transform(X_hold)
    X_test_proc = preprocessor.transform(X_test)

    # 4) Define base models
    rf = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
    gb = GradientBoostingClassifier(n_estimators=200, random_state=RANDOM_STATE)
    mlp = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=600, random_state=RANDOM_STATE)

    # 5) Stacking ensemble
    stack_clf = StackingClassifier(
        estimators=[('rf', rf), ('gb', gb), ('mlp', mlp)],
        final_estimator=LogisticRegression(),
        cv=5,
        n_jobs=-1,
        passthrough=True
    )
    stack_clf.fit(X_train_proc, y_train)

    # 6) Evaluate holdout at the default 0.5 threshold (baseline)
    hold_proba = stack_clf.predict_proba(X_hold_proc)[:,1]
    base_pred = (hold_proba >= 0.5).astype(int)
    base_f1 = f1_score(y_hold, base_pred)
    base_acc = accuracy_score(y_hold, base_pred)
    # ROC AUC is computed from probabilities, so it does not depend on the threshold.
    base_roc = roc_auc_score(y_hold, hold_proba)

    # 7) Threshold tuning (maximize F1); 0.5 is kept unless strictly beaten
    best_t = 0.5
    best_f1 = base_f1
    for t in np.linspace(0.1, 0.9, 81):
        pred_t = (hold_proba >= t).astype(int)
        cur_f1 = f1_score(y_hold, pred_t)
        if cur_f1 > best_f1:
            best_f1 = cur_f1
            best_t = float(t)
    # Apply tuned threshold
    hold_pred = (hold_proba >= best_t).astype(int)

    # 8) Optional: simple rule (reuse Day27 selected_rule if exists)
    selected_rule = None
    if os.path.exists("day27_models.joblib"):
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load a day27_models.joblib you produced yourself.
        saved27 = joblib.load("day27_models.joblib")
        if "selected_rule" in saved27:
            selected_rule = saved27["selected_rule"]
            hold_pred = apply_rule_to_preds(hold_pred, X_hold, selected_rule)

    # Score the predictions that will actually be used (tuned threshold plus
    # rule), so the reported F1 and accuracy describe the same predictions.
    final_f1 = f1_score(y_hold, hold_pred)
    final_acc = accuracy_score(y_hold, hold_pred)

    # 9) Final test predictions
    test_proba = stack_clf.predict_proba(X_test_proc)[:,1]
    test_pred = (test_proba >= best_t).astype(int)
    if selected_rule:
        test_pred = apply_rule_to_preds(test_pred, X_test, selected_rule)

    # 10) Save submission, models, report
    submission = pd.DataFrame({id_col if id_col else "Id": test_ids, TARGET: test_pred})
    submission.to_csv(OUTPUT_SUB, index=False)

    saved_obj = {
        "preprocessor": preprocessor,
        "stacking_model": stack_clf,
        "best_threshold": best_t,
        "selected_rule": selected_rule
    }
    joblib.dump(saved_obj, OUTPUT_MODELS)

    report = {
        "holdout_f1": float(final_f1),
        "holdout_acc": float(final_acc),
        "holdout_roc": float(base_roc),
        "baseline_f1": float(base_f1),
        "baseline_acc": float(base_acc),
        "best_threshold": best_t,
        "selected_rule": selected_rule
    }
    with open(OUTPUT_REPORT, "w") as f:
        json.dump(report, f, indent=2)

    print("Done. Outputs:")
    print("-", OUTPUT_SUB)
    print("-", OUTPUT_MODELS)
    print("-", OUTPUT_REPORT)

# Run the full pipeline only when this code is executed as the main program.
if __name__ == "__main__":
    main()


Done. Outputs:
- day28_submission.csv
- day28_models.joblib
- day28_report.json


In [13]:
import joblib
import pandas as pd

# Restore the Day 28 artifacts: fitted model, fitted preprocessor, tuned
# threshold, and the optional override rule (None when the key is absent).
saved = joblib.load("day28_models.joblib")
stack_model, preprocessor = saved["stacking_model"], saved["preprocessor"]
best_t = saved["best_threshold"]
selected_rule = saved.get("selected_rule")

# Example: preprocess input DataFrame
def predict_input(df_input):
    """Predict classes and positive-class probabilities for ``df_input``.

    The frame is transformed with the loaded preprocessor, scored by the
    stacking model, and thresholded at ``best_t``. When a rule was saved, it
    is applied to the thresholded predictions using the raw input frame.

    Returns:
        Tuple ``(preds, proba)``: the 0/1 predictions and the probability of
        class 1 for each row.
    """
    features = preprocessor.transform(df_input)
    proba = stack_model.predict_proba(features)[:,1]
    preds = (proba >= best_t).astype(int)
    if not selected_rule:
        return preds, proba
    return apply_rule_to_preds(preds, df_input, selected_rule), proba


In [14]:
!pip install gradio --quiet

import gradio as gr

# Define input features (use your X columns)
# The names are taken from the loaded preprocessor, so the UI fields line up
# with what transform() expects. NOTE(review): feature_names_in_ is only
# available when the preprocessor was fitted on a DataFrame - confirm.
feature_cols = list(preprocessor.feature_names_in_)  # all features used

def predict_gradio(*args):
    """Gradio callback: score one row given its feature values in order.

    The positional values are paired with ``feature_cols`` to form a one-row
    DataFrame, which is passed to ``predict_input``. Returns a dict with the
    predicted class and the probability of class 1 for that row.
    """
    single_row = pd.DataFrame([args], columns=feature_cols)
    labels, scores = predict_input(single_row)
    result = {}
    result["Predicted Class"] = int(labels[0])
    result["Probability of 1"] = float(scores[0])
    return result

# Build Gradio interface
# One numeric field per feature column, in the order predict_gradio expects.
inputs = [gr.Number(label=col) for col in feature_cols]  # adjust types if categorical
# predict_gradio returns a dict holding a class id and a probability. gr.Label
# would treat both values as label confidences and rank them, which
# misrepresents the class id, so the dict is shown verbatim as JSON instead.
outputs = [gr.JSON(label="Prediction")]

# live=True re-runs the prediction whenever an input value changes.
iface = gr.Interface(fn=predict_gradio, inputs=inputs, outputs=outputs, live=True)
iface.launch()



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://340dadc44b61f27565.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


