<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-6/day06_ensemblesipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# day06_ensembles.py
# Run from your project root. Creates day06/assets/ with plots & saves models/results.

import os
from pathlib import Path
import json
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
import joblib

# Optional xgboost (try import)
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False

# -------------------------
# I/O & folders
# -------------------------
# Everything this script writes goes under day06/; plots and models go in day06/assets/.
OUT = Path("day06")
ASSETS = OUT / "assets"
for folder in (OUT, ASSETS):  # parent first, so the child can be created
    folder.mkdir(exist_ok=True)

# Input candidates in priority order: engineered features first,
# then the cleaned/raw Titanic files as a fallback.
processed_candidates = [
    Path("day05/day05_titanic_feat.csv"),
    Path("day05_titanic_feat.csv"),
]
clean_candidates = [
    Path("day02/day02_titanic_clean.csv"),
    Path("day02/day02_titanic_preserved.csv"),
    Path("day02/train.csv"),
    Path("train.csv"),
]

data = None

# First choice: the first processed feature file that exists.
for p in processed_candidates:
    if not p.exists():
        continue
    print("Using processed features:", p)
    data = pd.read_csv(p)
    break

# Fallback: the first cleaned/raw file that exists; `raw` keeps the untouched frame.
if data is None:
    for p in clean_candidates:
        if not p.exists():
            continue
        print("Loading cleaned/raw file:", p)
        raw = pd.read_csv(p)
        data = raw.copy()
        break

# Nothing usable on disk: stop with an actionable message.
if data is None:
    raise FileNotFoundError("No data found. Place 'day05/day05_titanic_feat.csv' or 'day02/day02_titanic_clean.csv' or upload 'train.csv' in project root.")

# The target column is mandatory; there is no attempt to recover it from another file.
if 'Survived' not in data.columns:
    raise KeyError("Loaded dataset does not contain 'Survived' column. Ensure you pass processed file with target.")

# -------------------------
# Prepare X, y
# -------------------------
# If processed (many numeric columns), use as is; else do safe minimal transforms
def prepare_features(df):
    """Return a model-ready copy of ``df``; the input frame is not modified.

    Steps, each applied only when the relevant column is present:

    * drop the identifier/free-text columns ``PassengerId``, ``Ticket``,
      ``Cabin`` and ``Name``;
    * encode a string-valued ``Sex`` column as 1 when the lower-cased value
      starts with ``'m'`` and 0 otherwise;
    * coerce ``Age`` and ``Fare`` to numeric (unparseable values become NaN)
      and fill missing values with the column median;
    * one-hot encode every remaining object/string/categorical column except
      ``Survived`` using ``drop_first=True``.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        A new ``pandas.DataFrame`` with the transformations applied.
    """
    df = df.copy()

    # Drop obvious unused columns in one call instead of one in-place drop each.
    drop_cols = ['PassengerId', 'Ticket', 'Cabin', 'Name']
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    # If Sex is present as text (object or a pandas string dtype), convert to binary.
    if 'Sex' in df.columns:
        sex_dtype = df['Sex'].dtype
        if sex_dtype == object or pd.api.types.is_string_dtype(sex_dtype):
            df['Sex'] = df['Sex'].str.lower().str.startswith('m').astype(int)

    # Fill Age/Fare missing values. Assign the result back: the chained
    # `df[col].fillna(..., inplace=True)` form acts on a temporary object and
    # pandas warns it stops updating `df` (Copy-on-Write / pandas 3.0).
    for col in ('Age', 'Fare'):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].fillna(df[col].median())

    # One-hot encode remaining categorical columns, in frame order.
    # String dtypes are included so text columns never reach the models raw.
    cat_cols = [
        c for c, dtype in df.dtypes.items()
        if c != 'Survived' and (
            dtype == object
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )
    ]
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    return df

# Run the preprocessing defined above on the loaded frame.
df = prepare_features(data)
print("Prepared data shape:", df.shape)

# Target is the 'Survived' column; every other column is a feature.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% for testing; stratify on y so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
print("Train/Test:", X_train.shape, X_test.shape)

# -------------------------
# Helper: evaluate & save metrics
# -------------------------
def evaluate_model(name, model, X_test, y_test, save_prefix):
    """Score a fitted classifier on the test split and save its diagnostic plots.

    Prints the metrics and a classification report, writes
    ``<save_prefix>_confusion.png`` into ``ASSETS`` and, when the model has
    ``predict_proba``, also writes ``<save_prefix>_roc.png``.

    Args:
        name: Display name used in printed output and plot titles.
        model: Fitted estimator exposing ``predict`` (and optionally
            ``predict_proba``).
        X_test: Test features.
        y_test: True test labels.
        save_prefix: File-name prefix for the saved plots.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``; ``roc_auc`` is ``None`` when the model has no
        ``predict_proba``.
    """
    predictions = model.predict(X_test)
    # Probability of the second class column, if the model can provide one.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    else:
        proba = None

    metrics = {
        "accuracy": float(accuracy_score(y_test, predictions)),
        "precision": float(precision_score(y_test, predictions)),
        "recall": float(recall_score(y_test, predictions)),
        "f1": float(f1_score(y_test, predictions)),
        "roc_auc": None if proba is None else float(roc_auc_score(y_test, proba)),
    }
    print(f"\n=== {name} ===")
    print("metrics:", metrics)
    print("\nClassification report:\n", classification_report(y_test, predictions))

    # Confusion-matrix heatmap.
    matrix = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(4, 3))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(ASSETS / f"{save_prefix}_confusion.png")
    plt.close()

    # Without probabilities there is no ROC curve to draw.
    if proba is None:
        return metrics

    false_pos, true_pos, _ = roc_curve(y_test, proba)
    plt.figure(figsize=(5, 4))
    plt.plot(false_pos, true_pos, label=f"{name} (AUC={metrics['roc_auc']:.3f})")
    plt.plot([0, 1], [0, 1], '--', color='grey')  # chance diagonal
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"{name} ROC")
    plt.legend()
    plt.tight_layout()
    plt.savefig(ASSETS / f"{save_prefix}_roc.png")
    plt.close()

    return metrics

# Metrics for every model, keyed by model name; dumped to JSON at the end.
results = {}

# -------------------------
# 1) Random Forest baseline
# -------------------------
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)
results['random_forest'] = evaluate_model("RandomForest", rf, X_test, y_test, "rf")

# Persist the fitted forest next to its plots.
joblib.dump(rf, ASSETS / "rf_model.joblib")

# Feature importances, largest first; plot the top 20.
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)
imp_top = importances.head(20)

plt.figure(figsize=(6, 8))
sns.barplot(x=imp_top.values, y=imp_top.index)
plt.title("RF Top Feature Importances")
plt.tight_layout()
plt.savefig(ASSETS / "rf_feature_importances.png")
plt.close()

# -------------------------
# 2) Gradient Boosting (sklearn)
# -------------------------
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
results['gradient_boosting'] = evaluate_model(
    "GradientBoosting", gb, X_test, y_test, "gb"
)
joblib.dump(gb, ASSETS / "gb_model.joblib")

# -------------------------
# 3) XGBoost (if available)
# -------------------------
# XGBOOST_AVAILABLE is set by the guarded import at the top of the script.
if not XGBOOST_AVAILABLE:
    print("XGBoost not installed. To enable, run: pip install xgboost")
else:
    xgb = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1,
    )
    xgb.fit(X_train, y_train)
    results['xgboost'] = evaluate_model("XGBoost", xgb, X_test, y_test, "xgb")
    joblib.dump(xgb, ASSETS / "xgb_model.joblib")

# -------------------------
# 4) Stacking ensemble (use RF + GB + optionally XGB)
# -------------------------
# Base learners: always RF and GB; XGB joins only when the import succeeded.
estimators = [('rf', rf), ('gb', gb)]
if XGBOOST_AVAILABLE:
    estimators.append(('xgb', xgb))

# A logistic regression combines the base learners' outputs.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1,
)
stack.fit(X_train, y_train)
results['stacking'] = evaluate_model("Stacking", stack, X_test, y_test, "stack")
joblib.dump(stack, ASSETS / "stack_model.joblib")

# -------------------------
# 5) Summarize & save results
# -------------------------
# One JSON file holding the metrics dict of every model evaluated above.
results_path = OUT / "day06_results.json"
with open(results_path, "w") as f:
    f.write(json.dumps(results, indent=2))

print("\nAll done. Results saved to:", OUT)
print("Assets (plots & models) saved to:", ASSETS)
print("Quick look at results:", results)


=== Day 2 cleaning script starting ===
Found train file at: /content/train.csv
Raw data shape: (891, 12)
Saved cleaned (minimal) dataset -> day02/day02_titanic_clean.csv (shape: (891, 12))
Saved preserved dataset -> day02/day02_titanic_preserved.csv (shape: (891, 15))

--- Quick checks on cleaned file ---
Columns: ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Sex_bin', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']
Missing values per column:
 PassengerId    0
Survived       0
Pclass         0
Sex            0
Sex_bin        0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
FamilySize     0
IsAlone        0
dtype: int64

Sample rows:
  PassengerId  Survived  Pclass    Sex  Sex_bin  Age  SibSp  Parch    Fare Embarked  FamilySize  IsAlone
           1         0       3   male        1 22.0      1      0  7.2500        S           2        0
           2         1       1 female        0 38.0      1      0 71.2833        C        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett