<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-18/day18_stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# day18_titanic.py
# -------------------------------------------------------
# Titanic ML Challenge - Day 18 (Full Pipeline, Clean)
# -------------------------------------------------------

import pandas as pd
import numpy as np
import os
from pathlib import Path
from datetime import datetime

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# ---------------------------
# 1) Paths
# ---------------------------
# Processed CSVs are read from DATA_DIR; submission files go to SUB_DIR.
DATA_DIR = Path("data") / "processed"
SUB_DIR = Path("submissions")

train_path = DATA_DIR.joinpath("train_processed.csv")
test_path = DATA_DIR.joinpath("test_processed.csv")

# Create the output directory (and any missing parents) before training
# starts, so a long run cannot fail at the very end on a missing folder.
SUB_DIR.mkdir(exist_ok=True, parents=True)


# ---------------------------
# 2) Load Data
# ---------------------------
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Split the labelled frame into the target column and everything else.
y = train["Survived"]
X = train.drop(columns=["Survived"])

# Work on a copy so the raw test frame stays untouched.
X_test = test.copy()


# ---------------------------
# 3) Preprocessing
# ---------------------------
def build_preprocessor(X_df: "pd.DataFrame") -> "ColumnTransformer":
    """Build an unfitted ColumnTransformer that routes columns by dtype.

    Numeric and boolean columns are median-imputed and standardized.
    Object and categorical columns are imputed with the constant
    ``"MISSING"`` and one-hot encoded (unknown categories are ignored at
    transform time). ``PassengerId`` is deliberately left out of both
    groups; since no ``remainder`` is passed to ``ColumnTransformer``,
    columns that are not listed do not reach the model.

    Args:
        X_df: Feature DataFrame; only its column names and dtypes are read.

    Returns:
        A ``ColumnTransformer`` with a ``"num"`` and a ``"cat"`` branch.
    """
    # Select by dtype *family* instead of the exact names "int64"/"float64".
    # Columns such as bool dummy variables, int32 or float32 would otherwise
    # match neither list and be silently discarded by the transformer.
    num_features = X_df.select_dtypes(include=["number", "bool"]).columns.tolist()
    cat_features = X_df.select_dtypes(include=["object", "category"]).columns.tolist()

    # PassengerId is an identifier, not a feature (it is kept aside for the
    # submission file), so exclude it whichever dtype it was read as.
    id_column = "PassengerId"
    num_features = [col for col in num_features if col != id_column]
    cat_features = [col for col in cat_features if col != id_column]

    # Numeric branch: fill gaps with the median, then scale.
    num_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    # Categorical branch: give missing values their own "MISSING" level,
    # then one-hot encode into a dense array.
    cat_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    return ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ])


# ---------------------------
# 4) Define Models
# ---------------------------
# Base learner: random forest with a fixed seed.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=7,
)

# Base learner: gradient boosting with a fixed seed.
gb = GradientBoostingClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=3,
    learning_rate=0.05,
)

# Stacked ensemble: the two base learners feed a logistic-regression
# final estimator, with cv=5 and all available cores (n_jobs=-1).
stack_model = StackingClassifier(
    estimators=[
        ("rf", rf),
        ("gb", gb),
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1,
    cv=5,
)


# ---------------------------
# 5) Full Pipeline
# ---------------------------
# The preprocessor is derived from the column dtypes of the training features.
preprocessor = build_preprocessor(X_df=X)

# Single estimator: preprocessing followed by the stacked model, so both
# steps are fitted together on whatever data is passed to fit().
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", stack_model),
    ]
)


# ---------------------------
# 6) Train / Validation
# ---------------------------
# Hold out 20% of the labelled rows, stratified on the target, fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

print("Training model...")
clf.fit(X_train, y_train)

# Hold-out estimate.
val_pred = clf.predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=val_pred)
print(f"Validation Accuracy: {val_acc:.4f}")

# 5-fold cross-validated accuracy over the full labelled set.
cv_scores = cross_val_score(clf, X, y, scoring="accuracy", cv=5, n_jobs=-1)
print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")


# ---------------------------
# 7) Predict Test Set
# ---------------------------
# The earlier fit used only the 80% training split. Refit on every labelled
# row so the submitted predictions come from a model that has seen all the
# available training data.
clf.fit(X, y)
test_pred = clf.predict(X_test)

# ---------------------------
# 8) Save Submission
# ---------------------------
if "PassengerId" in X_test.columns:
    passenger_id = X_test["PassengerId"]
else:
    # NOTE(review): this fallback numbers rows 1..N; confirm that this matches
    # the IDs the competition expects before submitting such a file.
    passenger_id = np.arange(1, len(X_test) + 1)

submission = pd.DataFrame({
    "PassengerId": passenger_id,
    "Survived": test_pred,
})

# Timestamped filename (minute resolution) so separate runs get separate files.
run_id = f"day18_stacking_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
out_path = SUB_DIR / run_id
submission.to_csv(out_path, index=False)

print(f"✅ Submission saved at: {out_path}")


Training model...
Validation Accuracy: 0.6648
CV Accuracy: 0.7274 (+/- 0.0429)
✅ Submission saved at: submissions/day18_stacking_20250925_1713.csv
