<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-15/day15_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# day15_titanic.py
# -------------------------------------------------------
# Titanic ML Challenge - Day 15 (Full Pipeline in One File)
# -------------------------------------------------------

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

# ---------------------------
# 1) Paths
# ---------------------------
# Inputs: the processed train/test CSVs that the loading step reads.
DATA_DIR = Path("data") / "processed"
train_path = DATA_DIR.joinpath("train_processed.csv")
test_path = DATA_DIR.joinpath("test_processed.csv")

# Output: the submissions directory is created up front (including any
# missing parents; no error if it already exists) so the submission file
# written at the end of the script has a directory to land in.
SUB_DIR = Path("submissions")
SUB_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------
# 2) Load data
# ---------------------------
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Keep the IDs aside: PassengerId is dropped from the feature matrices below,
# and the test IDs are needed again when the submission file is built.
train_passenger_id = train["PassengerId"].copy()
test_passenger_id = test["PassengerId"].copy()

# Target first, then features (every column except the target and the ID)
y = train["Survived"]
X = train.drop(columns=["Survived", "PassengerId"])
X_test = test.drop(columns=["PassengerId"])

# One-hot encode Embarked, but only when the processed data still has that
# column; both frames are encoded the same way (no extra NaN indicator).
if "Embarked" in X.columns:
    X, X_test = (
        pd.get_dummies(frame, columns=["Embarked"], dummy_na=False)
        for frame in (X, X_test)
    )

# Give the test frame exactly the training columns, in training order
# (join="left"): columns missing from test are added and filled with 0,
# columns that exist only in test are dropped.
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

# ---------------------------
# 3) Handle Missing Values
# ---------------------------
# Median imputation is NOT applied to X up front. The imputer becomes the
# first step of the model pipeline assembled in section 6, so its medians are
# learned only from the rows a given fit actually sees (the training split,
# or each cross-validation training fold). Fitting it on all of X before
# splitting would let held-out rows influence the imputation statistics and
# make the validation / CV scores optimistic.
# Side effect of this design: X and X_test stay un-imputed DataFrames, so
# always predict through `model` (the pipeline), not `stack_model` directly.
imputer = SimpleImputer(strategy="median")

# ---------------------------
# 4) Train/Validation split
# ---------------------------
# 80/20 hold-out; stratify=y keeps the Survived class ratio the same in both
# parts, random_state fixes the shuffle for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# 5) Define base models
# ---------------------------
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    random_state=42
)

gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

# (name, estimator) pairs in the form StackingClassifier expects
base_models = [
    ("rf", rf),
    ("gb", gb),
]

# ---------------------------
# 6) Stacking model
# ---------------------------
# passthrough=False: the logistic-regression meta-learner is trained on the
# base models' cross-validated (cv=5) predictions only, not on the raw
# features.
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000),
    passthrough=False,
    cv=5,
    n_jobs=-1
)

# Imputer + stacker as a single estimator. Pipeline does not clone its steps,
# so fitting `model` also fits `imputer` and `stack_model` in place.
model = Pipeline([
    ("imputer", imputer),
    ("stack", stack_model),
])

# ---------------------------
# 7) Train & Evaluate
# ---------------------------
print("Training stacking model...")
model.fit(X_train, y_train)

# Validation prediction (imputation medians come from X_train only)
val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)
print(f"Validation Accuracy: {val_acc:.4f}")

# Cross-validation score. cross_val_score fits clones of the whole pipeline,
# so the imputer is re-fit inside every fold and `model` itself is untouched.
cv_scores = cross_val_score(model, X, y, cv=5, scoring="accuracy", n_jobs=-1)
print(f"CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

# ---------------------------
# 8) Predict on test set
# ---------------------------
# The scores above are the performance estimate; the model that produces the
# submission is refit on every labelled row so the 20% validation rows are
# not left unused when predicting the test set.
model.fit(X, y)
test_pred = model.predict(X_test)

# ---------------------------
# 9) Save submission
# ---------------------------
out_path = SUB_DIR / "day15_stacking.csv"

# Two columns: the test PassengerId values and, row for row, the predicted
# Survived label. The frame's index is not written to the file.
submission = test_passenger_id.to_frame(name="PassengerId").assign(Survived=test_pred)
submission.to_csv(out_path, index=False)

print("✅ Submission saved at:", out_path)


Training stacking model...
Validation Accuracy: 0.8101
CV Accuracy: 0.8294 (+/- 0.0184)
✅ Submission saved at: submissions/day15_stacking.csv
