<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-11/day11_cv_grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# day11_cv_grid.py
# Usage: python day11_cv_grid.py
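# Outputs:
#  - day11/day11_gridsearch_results.csv
#  - day11/day11_randomsearch_results.csv
#  - day11/day11_results.json
#  - day11/assets/best_model.joblib
#  - day11/assets/best_model_randomsearch.joblib
#  - day11/assets/best_confusion.png
#  - day11/assets/best_roc.png (only when the best model exposes predict_proba)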

import warnings, json
warnings.filterwarnings("ignore")

from pathlib import Path
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import (train_test_split, StratifiedKFold,
                                     cross_val_score, GridSearchCV, RandomizedSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, roc_curve, classification_report)
import joblib

# Output locations: results go under day11/, models and figures under day11/assets/.
OUT = Path("day11")
OUT.mkdir(exist_ok=True)
ASSETS = OUT / "assets"
ASSETS.mkdir(exist_ok=True)

# ---- 0) Load processed data (use best available)
# Candidate inputs in order of preference; the first one found on disk is loaded.
candidates = [
    Path("day10/day10_titanic_feat.csv"),
    Path("day05/day05_titanic_feat.csv"),
    Path("day02/day02_titanic_clean.csv"),
    Path("train.csv"),
]
source_csv = next((path for path in candidates if path.exists()), None)
if source_csv is None:
    raise FileNotFoundError("No input CSV found. Place processed file in one of the candidate paths.")

print("Loading:", source_csv)
df = pd.read_csv(source_csv)

# ---- 1) Prepare X, y (safe)
# The target column is mandatory; fail early with a clear message when it is absent.
if 'Survived' not in df.columns:
    raise KeyError("Dataset must contain 'Survived' column.")

# Remove the obvious meta columns, but only those actually present in this file.
meta_cols = [c for c in ('PassengerId', 'Ticket', 'Cabin', 'Name') if c in df.columns]
if meta_cols:
    df = df.drop(columns=meta_cols)

# One-hot encode any remaining string/categorical columns (the target is never encoded).
cat_cols = [
    c for c in df.select_dtypes(include=['object', 'category']).columns
    if c != 'Survived'
]
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Target vector and feature matrix (every column except the target).
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 15% of the rows, stratified on the target, for the final evaluation.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42,
)

# ---- 2) Quick CV baseline scores
# 5-fold stratified CV on the training split, scored with f1.
#
# Each baseline estimator is evaluated inside a small pipeline that first
# median-imputes missing values. The loader above may fall back to a less
# processed CSV; if that file still contains NaNs, LogisticRegression would
# raise on them. The imputer is fitted inside every CV fold (it is part of the
# estimator passed to cross_val_score), so nothing leaks from validation folds.
# On data without missing values the imputer changes nothing.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
}
print("\nBaseline cross_val_score (f1):")
for name, mdl in models.items():
    baseline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("clf", mdl),
    ])
    scores = cross_val_score(baseline, X_train_full, y_train_full, cv=cv, scoring='f1', n_jobs=-1)
    print(f" {name}: mean f1 = {scores.mean():.4f}, std = {scores.std():.4f}")

# ---- 3) Build a simple pipeline
# If you have a preprocessor saved from Day10, you can load it and use it here.
# Steps:
#   imputer - median-fills missing values. Needed when a fallback CSV still
#             contains NaNs: StandardScaler passes NaNs through, and whether
#             RandomForestClassifier accepts them depends on the scikit-learn
#             version. A no-op when the features are already complete.
#   scaler  - numeric scaling often helps some models
#   clf     - the random forest whose hyper-parameters are tuned below
pipeline_rf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42)),
])

# ---- 4) GridSearchCV (small grid) for RandomForest
# 2 x 3 x 2 = 12 parameter combinations, each scored with 4-fold stratified CV on f1.
# The "clf__" prefix routes each parameter to the pipeline's "clf" step.
param_grid = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [4, 6, None],
    "clf__max_features": ["sqrt", "log2"]
}
gs = GridSearchCV(pipeline_rf, param_grid, cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=42),
                  scoring='f1', n_jobs=-1, verbose=1, return_train_score=True)
print("\nRunning GridSearchCV (this may take a few minutes)...")
gs.fit(X_train_full, y_train_full)

# Save full cv results to CSV (Day5 style)
gs_results_path = OUT / "day11_gridsearch_results.csv"
pd.DataFrame(gs.cv_results_).to_csv(gs_results_path, index=False)
print("GridSearchCV results saved to:", gs_results_path)

# Winning hyper-parameters and their cross-validated f1
best_params, best_score = gs.best_params_, gs.best_score_
print("Best params:", best_params)
print("Best CV f1:", best_score)

# Score the best estimator found by the search on the holdout split
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)
if hasattr(best_model, "predict_proba"):
    # Column 1 holds the probability of the second class in classes_
    y_proba = best_model.predict_proba(X_test)[:, 1]
else:
    y_proba = None

# Label-based metrics first, then ROC AUC (None when no probabilities are available)
metrics = {
    label: float(scorer(y_test, y_pred))
    for label, scorer in (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
}
metrics["roc_auc"] = None if y_proba is None else float(roc_auc_score(y_test, y_proba))
print("\nTest metrics:", json.dumps(metrics, indent=2))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Persist the fitted model and a JSON summary of the search and test results
joblib.dump(best_model, ASSETS / "best_model.joblib")
summary = {
    "best_params": best_params,
    "cv_best_score": best_score,
    "test_metrics": metrics,
}
with open(OUT / "day11_results.json", "w") as fh:
    json.dump(summary, fh, indent=2)

# ---- 5) Optional: RandomizedSearchCV (wider but faster)
# RandomizedSearchCV is already imported at the top of the file; the import is
# repeated here so this optional section stays self-contained.
from sklearn.model_selection import RandomizedSearchCV

# Wider search space than the grid above: 4 x 5 x 3 x 3 = 180 combinations,
# of which only n_iter=12 are sampled.
param_dist = {
    "clf__n_estimators": [50, 100, 200, 300],
    "clf__max_depth": [3, 4, 6, 8, None],
    "clf__max_features": ["sqrt", "log2", None],
    "clf__min_samples_split": [2, 5, 10],
}
rs = RandomizedSearchCV(
    pipeline_rf,
    param_distributions=param_dist,
    n_iter=12,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
print("\nRunning RandomizedSearchCV (optional)...")
rs.fit(X_train_full, y_train_full)

# Keep the full sampling history and the best randomized-search model
rs_results_path = OUT / "day11_randomsearch_results.csv"
pd.DataFrame(rs.cv_results_).to_csv(rs_results_path, index=False)
print("RandomSearch results saved to:", rs_results_path)
joblib.dump(rs.best_estimator_, ASSETS / "best_model_randomsearch.joblib")

# ---- 6) Quick plots: best confusion matrix & ROC (if proba)
# Confusion-matrix heatmap for the holdout predictions of the grid-search model
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d')
plt.title("Best Model Confusion")
plt.savefig(ASSETS / "best_confusion.png")
plt.close()

# The ROC curve needs class probabilities, so it is skipped when none were computed
if y_proba is not None:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc_label = f"AUC={metrics['roc_auc']:.3f}"
    plt.figure(figsize=(5, 4))
    plt.plot(fpr, tpr, label=auc_label)
    plt.plot([0, 1], [0, 1], '--', color='gray')  # chance-level diagonal
    plt.legend()
    plt.title("ROC")
    plt.savefig(ASSETS / "best_roc.png")
    plt.close()

print("Saved models and plots to", ASSETS)
print("Day 11 complete ✅")
