<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-17/day17_tuning_quick.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# day17_tuning_quick.py
"""
Day 17 - Quick Hyperparameter Tuning (RandomizedSearchCV)
- Tunes RandomForest (and XGBoost/LightGBM if available)
- Designed to run fast for a 2-hour session (short n_iter)
"""

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import joblib, datetime, warnings

# Silence every warning category for the whole process. This keeps the cell
# output short, but it also hides deprecation and other library warnings
# emitted by the code below.
warnings.filterwarnings("ignore")

# ---------------- CONFIG ----------------
# Input: the script reads one processed training CSV from DATA_DIR.
DATA_DIR = Path("data/processed")
TRAIN_CSV = DATA_DIR / "train_processed.csv"
# Output: pickled search objects are written under MODELS_DIR. The directory is
# created as a module-level side effect; parent directories are NOT created,
# so a nested path would require its parent to exist already.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)
# Seed shared by the CV splitters, the randomized searches and the estimators.
RANDOM_STATE = 42
# Number of StratifiedKFold splits used for both the baseline and the searches.
CV_FOLDS = 5
N_ITER = 12   # small for quick runs; increase later if you have more time
# Forwarded to RandomizedSearchCV(verbose=...).
VERBOSE = 2
# Forwarded as n_jobs to cross_val_score, RandomizedSearchCV and the random forests.
N_JOBS = -1
# ----------------------------------------

# Optional imports
# Each booster is tuned only if its package can be imported. Any exception
# raised during the import (not just ImportError) marks the library as
# unavailable, and main() then prints a "skipping" message for it.
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

try:
    from lightgbm import LGBMClassifier
    HAS_LGB = True
except Exception:
    HAS_LGB = False

def add_features_if_missing(df):
    """Return a copy of *df* with derived features added where they are absent.

    Each feature is added only if its column does not already exist and the
    columns it is derived from are present:

    - ``FamilySize``: ``SibSp + Parch + 1`` (missing counts treated as 0).
    - ``IsAlone``: 1 when ``FamilySize == 1``, else 0.
    - ``Title``: the text between the comma and the first period in ``Name``;
      ``"Other"`` when the pattern does not match.
    - ``FarePerPerson``: ``Fare`` (missing treated as 0) divided by
      ``FamilySize``. A family size of 0 is treated as 1, and when no
      ``FamilySize`` column is available the fare is divided by 1.

    The input frame is never modified.
    """
    df = df.copy()
    if "FamilySize" not in df.columns and {"SibSp", "Parch"}.issubset(df.columns):
        df["FamilySize"] = df["SibSp"].fillna(0).astype(int) + df["Parch"].fillna(0).astype(int) + 1
    if "IsAlone" not in df.columns and "FamilySize" in df.columns:
        df["IsAlone"] = (df["FamilySize"] == 1).astype(int)
    if "Title" not in df.columns and "Name" in df.columns:
        df["Title"] = df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False).str.strip().fillna("Other")
    if "FarePerPerson" not in df.columns and "Fare" in df.columns:
        # df.get("FamilySize", 1) would hand back a plain int when the column
        # is absent, and an int has no .replace(); branch explicitly instead.
        if "FamilySize" in df.columns:
            divisor = df["FamilySize"].replace(0, 1)  # guard against division by zero
        else:
            divisor = 1
        df["FarePerPerson"] = df["Fare"].fillna(0) / divisor
    return df

def encode_and_impute(X_df):
    """One-hot encode *X_df* and fill missing values with column medians.

    Encoding uses ``pd.get_dummies`` with ``dummy_na=False``, so missing
    values do not get an indicator column of their own. The imputer is fitted
    on every row passed in, not separately per CV fold.

    Returns:
        A tuple ``(X_imp, imputer, columns)`` where ``X_imp`` is the imputed
        DataFrame, ``imputer`` is the fitted ``SimpleImputer`` and ``columns``
        is the list of encoded column names.
    """
    X_enc = pd.get_dummies(X_df, dummy_na=False)
    imputer = SimpleImputer(strategy="median")
    # Carry the original row labels over: fit_transform returns a bare array,
    # and rebuilding the frame without index= would silently renumber the rows
    # and break label-based alignment with the target for non-default indexes.
    X_imp = pd.DataFrame(
        imputer.fit_transform(X_enc),
        columns=X_enc.columns,
        index=X_enc.index,
    )
    return X_imp, imputer, X_enc.columns.tolist()

def cv_mean_std(clf, X, y, folds=CV_FOLDS):
    """Cross-validate *clf* on accuracy with shuffled, stratified folds.

    Returns:
        ``(mean, std, scores)``: the mean and standard deviation of the
        per-fold accuracies as plain floats, followed by the raw score array.
    """
    splitter = StratifiedKFold(
        n_splits=folds,
        shuffle=True,
        random_state=RANDOM_STATE,
    )
    fold_scores = cross_val_score(
        clf,
        X,
        y,
        cv=splitter,
        scoring="accuracy",
        n_jobs=N_JOBS,
    )
    mean_acc = float(fold_scores.mean())
    std_acc = float(fold_scores.std())
    return mean_acc, std_acc, fold_scores

def run_random_search(estimator, param_dist, X, y, name="model"):
    """Fit a RandomizedSearchCV for *estimator*, report it, and pickle it.

    The search samples ``N_ITER`` candidates from *param_dist*, scores them on
    accuracy with shuffled stratified ``CV_FOLDS``-fold CV, prints the best
    score and parameters, and dumps the fitted search object into
    ``MODELS_DIR`` under a name built from *name* and a minute-resolution
    timestamp.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    print(f"\n--- RandomizedSearchCV for {name} (n_iter={N_ITER}) ---")

    search_options = dict(
        param_distributions=param_dist,
        n_iter=N_ITER,
        cv=StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE),
        scoring="accuracy",
        random_state=RANDOM_STATE,
        verbose=VERBOSE,
        n_jobs=N_JOBS,
    )
    rs = RandomizedSearchCV(estimator, **search_options)
    rs.fit(X, y)

    print(f"Best CV score ({name}): {rs.best_score_:.4f}")
    print(f"Best params ({name}): {rs.best_params_}")

    # Persist the whole search object (not just the best estimator).
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    destination = MODELS_DIR / f"{name}_randomsearch_{ts}.pkl"
    joblib.dump(rs, destination)
    print(f"Saved RandomizedSearchCV object to models/{name}_randomsearch_{ts}.pkl")
    return rs

def main():
    """Run the quick tuning session end to end.

    Steps: load ``TRAIN_CSV``, add any missing derived features, one-hot
    encode and median-impute the predictors once, print a baseline
    random-forest CV accuracy, then run a randomized search for the random
    forest and, when the libraries imported successfully, for XGBoost and
    LightGBM. Each search is saved by ``run_random_search``.

    Raises:
        SystemExit: if the training CSV has no ``Survived`` column.
    """
    print("Day 17 tuning — quick run")

    train = pd.read_csv(TRAIN_CSV)
    if "Survived" not in train.columns:
        raise SystemExit("train_processed.csv must contain 'Survived' column")

    # Derived columns are only added when absent, so this is safe on data
    # that was already feature-engineered.
    train = add_features_if_missing(train)
    y = train["Survived"].astype(int)
    X_df = train.drop(["Survived"], axis=1, errors="ignore")

    # Encoding and imputation happen once up front (fast approach) rather
    # than inside every CV fold; the fitted imputer and column list are not
    # needed afterwards.
    X, _, _ = encode_and_impute(X_df)
    print("Shape after encoding/imputation:", X.shape)

    # Untuned random forest as the reference point.
    baseline = RandomForestClassifier(
        n_estimators=150,
        random_state=RANDOM_STATE,
        n_jobs=N_JOBS,
    )
    base_mean, base_std, _ = cv_mean_std(baseline, X, y)
    print(f"\nBaseline RF CV accuracy: {base_mean:.4f} (+/- {base_std:.4f})")

    # -------- RandomForest tuning --------
    forest_space = {
        "n_estimators": [100, 200, 300, 400, 600],
        "max_depth": [None, 5, 8, 12, 20],
        "min_samples_split": [2, 4, 8],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False],
    }
    run_random_search(
        RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=N_JOBS),
        forest_space,
        X,
        y,
        name="random_forest",
    )

    # -------- XGBoost tuning (if available) --------
    if not HAS_XGB:
        print("\nXGBoost not installed — skipping XGB tuning.")
    else:
        xgb_space = {
            "n_estimators": [100, 200, 300, 400],
            "learning_rate": [0.01, 0.03, 0.05, 0.1],
            "max_depth": [3, 4, 5, 6],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
        }
        xgb_model = XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=RANDOM_STATE,
            n_jobs=1,
        )
        run_random_search(xgb_model, xgb_space, X, y, name="xgboost")

    # -------- LightGBM tuning (if available) --------
    if not HAS_LGB:
        print("\nLightGBM not installed — skipping LGB tuning.")
    else:
        lgb_space = {
            "n_estimators": [100, 200, 300, 400],
            "learning_rate": [0.01, 0.03, 0.05, 0.1],
            "num_leaves": [15, 31, 63],
            "feature_fraction": [0.6, 0.8, 1.0],
        }
        lgb_model = LGBMClassifier(random_state=RANDOM_STATE, n_jobs=1)
        run_random_search(lgb_model, lgb_space, X, y, name="lightgbm")

    print("\nDone. Models saved in 'models/' directory.")
    print("Next: • Use best estimators from these RandomizedSearchCV objects in your stacking pipeline.")
    print("Tip: If run time is long, reduce N_ITER or n_estimators and try again.")

# Start the tuning session only when this file is executed directly (or run as
# a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Day 17 tuning — quick run
Shape after encoding/imputation: (891, 30)

Baseline RF CV accuracy: 0.8238 (+/- 0.0212)

--- RandomizedSearchCV for random_forest (n_iter=12) ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best CV score (random_forest): 0.8451
Best params (random_forest): {'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_depth': 12, 'bootstrap': True}
Saved RandomizedSearchCV object to models/random_forest_randomsearch_20250924_1443.pkl

--- RandomizedSearchCV for xgboost (n_iter=12) ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best CV score (xgboost): 0.8485
Best params (xgboost): {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
Saved RandomizedSearchCV object to models/xgboost_randomsearch_20250924_1443.pkl

--- RandomizedSearchCV for lightgbm (n_iter=12) ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 34