<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-21/day21_feature_refine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# day21_feature_refine.py
"""
Day 21 - Feature refinement (2-hour focused script)
- Uses previous SHAP / permutation importance if available
- Creates a small set of interaction features from top K features
- Compares baseline CV vs refined CV with RandomForest
- Saves artifacts to outputs/ and models/
"""

import os
import glob
import joblib
import itertools
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

# ---------- CONFIG ----------
# Reproducibility and model-size knobs.
RANDOM_STATE = 42
CV_FOLDS = 5
RF_ESTIMATORS = 150  # kept small for speed; raise later

# Interaction-feature limits.
TOP_K = 10  # number of top-ranked features considered for interactions
MAX_INTERACTIONS = 15  # cap on pairwise interaction features created

# Processed input CSVs; the test file is optional.
DATA_DIR = Path("data/processed")
TRAIN_CSV = DATA_DIR.joinpath("train_processed.csv")
TEST_CSV = DATA_DIR.joinpath("test_processed.csv")

# Artifact directories, created up front if they do not exist yet.
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
# ----------------------------

def load_data():
    """Read the processed train CSV and, when the file exists, the test CSV.

    Boolean columns in each loaded frame are cast to strings in place
    (a safety measure against imputer dtype issues).

    Returns:
        Tuple (train, test); test is None when TEST_CSV does not exist.
    """
    train = pd.read_csv(TRAIN_CSV)
    test = None
    if TEST_CSV.exists():
        test = pd.read_csv(TEST_CSV)

    frames = [train] if test is None else [train, test]
    for frame in frames:
        # Safety: bool -> str so the later imputation step sees no bool dtype.
        for col in frame.select_dtypes(include=["bool"]).columns.tolist():
            frame[col] = frame[col].astype(str)

    return train, test

def encode_and_impute(train_df, test_df=None):
    """One-hot encode with pandas.get_dummies, then median-impute NaNs.

    Args:
        train_df: Training frame. A "Survived" column, if present, is
            dropped before encoding. The input frame is not modified.
        test_df: Optional test frame, encoded the same way and then
            restricted to the encoded training columns.

    Returns:
        Tuple (X_train_imp, X_test_imp, imputer). X_test_imp is None when
        test_df is None. imputer is the SimpleImputer fitted on the encoded
        training frame. Both returned frames keep the row index of their
        input frame, so they stay aligned with columns (such as the target)
        taken from the same source frame.
    """
    # drop() returns a new frame; errors="ignore" covers a missing target.
    X_train = train_df.drop(columns=["Survived"], errors="ignore")
    X_train_enc = pd.get_dummies(X_train, dummy_na=False)

    X_test_enc = None
    if test_df is not None:
        X_test_enc = pd.get_dummies(test_df, dummy_na=False)
        # join="left" keeps exactly the training columns: dummies seen only
        # in test are dropped, dummies missing from test are filled with 0.
        X_train_enc, X_test_enc = X_train_enc.align(
            X_test_enc, join="left", axis=1, fill_value=0
        )

    # Medians are learned from the training frame only and reused for test.
    imputer = SimpleImputer(strategy="median")
    X_train_imp = pd.DataFrame(
        imputer.fit_transform(X_train_enc),
        columns=X_train_enc.columns,
        index=X_train_enc.index,  # keep row labels instead of a new RangeIndex
    )

    X_test_imp = None
    if X_test_enc is not None:
        X_test_imp = pd.DataFrame(
            imputer.transform(X_test_enc),
            columns=X_test_enc.columns,
            index=X_test_enc.index,
        )

    return X_train_imp, X_test_imp, imputer

def cv_score_rf(X, y, n_estimators=RF_ESTIMATORS):
    """Cross-validate a RandomForestClassifier on (X, y) and report accuracy.

    Uses a shuffled StratifiedKFold with CV_FOLDS splits; both the splitter
    and the forest are seeded with RANDOM_STATE.

    Returns:
        Tuple (mean_accuracy, std_accuracy, per_fold_scores); the first two
        are plain floats, the third is the array from cross_val_score.
    """
    splitter = StratifiedKFold(
        n_splits=CV_FOLDS,
        shuffle=True,
        random_state=RANDOM_STATE,
    )
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    fold_scores = cross_val_score(
        forest, X, y, cv=splitter, scoring="accuracy", n_jobs=-1
    )
    mean_acc = float(fold_scores.mean())
    std_acc = float(fold_scores.std())
    return mean_acc, std_acc, fold_scores

def find_latest_importance():
    """Return the most recently modified SHAP/permutation importance CSV.

    Searches outputs/ and reports/ (relative to the current working
    directory) for CSV files whose names contain "shap_importance" or
    "permutation_importance".

    Returns:
        Path string of the newest matching file, or None when nothing
        matches.
    """
    patterns = [
        "outputs/*shap_importance*.csv",
        "outputs/*permutation_importance*.csv",
        "reports/*permutation_importance*.csv",
        "reports/*shap_importance*.csv",
    ]
    files = []
    for pattern in patterns:
        files.extend(glob.glob(pattern))
    if not files:
        return None
    # Rank by modification time rather than by path string: a plain sort
    # would always prefer reports/ over outputs/ regardless of file age.
    # The path is the tie-breaker so equal timestamps stay deterministic.
    return max(files, key=lambda path: (os.path.getmtime(path), path))

def quick_feature_importance_from_rf(X, y):
    """Fit a quick RF and return feature importances (DataFrame)."""
    rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
    rf.fit(X, y)
    imp = pd.Series(rf.feature_importances_, index=X.columns).sort_va_
