<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-16/day16_feature_quick.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# day16_feature_quick.py
# Day 16: quick, safe feature engineering + CV comparison (2-hour plan)

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
import datetime

# ------- CONFIG -------
DATA_DIR = Path("data/processed")
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_CSV = DATA_DIR / "train_processed.csv"
TEST_CSV = DATA_DIR / "test_processed.csv"
RANDOM_STATE = 42
CV_FOLDS = 5
# ----------------------

def add_features_if_missing(df):
    """Add commonly useful Titanic features only if they are NOT already present."""
    df = df.copy()
    # FamilySize
    if "FamilySize" not in df.columns and {"SibSp", "Parch"}.issubset(df.columns):
        df["FamilySize"] = df["SibSp"].fillna(0).astype(int) + df["Parch"].fillna(0).astype(int) + 1
    # IsAlone
    if "IsAlone" not in df.columns and "FamilySize" in df.columns:
        df["IsAlone"] = (df["FamilySize"] == 1).astype(int)
    # Title from Name
    if "Title" not in df.columns and "Name" in df.columns:
        df["Title"] = df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False).str.strip()
        df["Title"] = df["Title"].replace({
            "Mlle":"Miss","Ms":"Miss","Mme":"Mrs",
            "Lady":"Royal","Countess":"Royal","Sir":"Royal","Don":"Royal","Dona":"Royal",
            "Jonkheer":"Royal","Col":"Officer","Capt":"Officer","Major":"Officer"
        }).fillna("Other")
    # FarePerPerson
    if "FarePerPerson" not in df.columns and "Fare" in df.columns:
        if "FamilySize" in df.columns:
            df["FarePerPerson"] = df["Fare"].fillna(0) / df["FamilySize"].replace(0, 1)
        else:
            df["FarePerPerson"] = df["Fare"].fillna(0)
    # AgeBin
    if "AgeBin" not in df.columns and "Age" in df.columns:
        df["AgeBin"] = pd.cut(df["Age"].fillna(df["Age"].median()),
                              bins=[0,12,20,35,60,200], labels=False, include_lowest=True)
    # Ticket prefix
    if "TicketPrefix" not in df.columns and "Ticket" in df.columns:
        df["TicketPrefix"] = df["Ticket"].astype(str).str.replace(r'[\d\./]', '', regex=True).str.strip().str.split().str[0].replace("", "NO")
    return df

def encode_and_impute(X_train_df, X_test_df=None):
    """One-hot encode object columns, align train/test, then median-impute numeric NaNs."""
    # One-hot encode all object columns
    X_train_enc = pd.get_dummies(X_train_df, dummy_na=False)
    if X_test_df is not None:
        X_test_enc = pd.get_dummies(X_test_df, dummy_na=False)
        # Align columns (train cols -> test, fill missing with 0)
        X_train_enc, X_test_enc = X_train_enc.align(X_test_enc, join="left", axis=1, fill_value=0)
    else:
        X_test_enc = None

    imputer = SimpleImputer(strategy="median")
    X_train_imp = pd.DataFrame(imputer.fit_transform(X_train_enc), columns=X_train_enc.columns)
    if X_test_enc is not None:
        X_test_imp = pd.DataFrame(imputer.transform(X_test_enc), columns=X_test_enc.columns)
    else:
        X_test_imp = None

    return X_train_imp, X_test_imp

def cv_score(clf, X, y, folds=CV_FOLDS):
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=RANDOM_STATE)
    scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
    return scores

def main():
    t0 = datetime.datetime.now()
    print(f"Day16 run started: {t0.isoformat()}\n")

    # Load
    train = pd.read_csv(TRAIN_CSV)
    test = pd.read_csv(TEST_CSV)

    # Ensure target exists
    if "Survived" not in train.columns:
        raise ValueError("train_processed.csv must contain 'Survived' column.")

    # --- Baseline (existing processed features) ---
    print("1) Baseline evaluation on existing processed features...")
    X_base = train.drop(["Survived"], axis=1, errors="ignore")
    y = train["Survived"]
    # If test has Survived col (NaN), drop it
    test_for_align = test.drop(["Survived"], axis=1, errors="ignore")

    X_base_enc, _ = encode_and_impute(X_base, test_for_align)
    clf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
    base_scores = cv_score(clf, X_base_enc, y)
    print(f"Baseline CV acc (mean): {base_scores.mean():.4f}  std: {base_scores.std():.4f}")
    print("Baseline fold scores:", np.round(base_scores, 4))

    # --- Add features (only if missing) ---
    print("\n2) Adding features (if missing) to train/test...")
    train_fe = add_features_if_missing(train)
    test_fe = add_features_if_missing(test)

    # Drop 'Survived' from test (if present)
    test_fe = test_fe.drop(["Survived"], axis=1, errors="ignore")

    # --- Encode + Impute after FE ---
    X_fe = train_fe.drop(["Survived"], axis=1, errors="ignore")
    X_fe_enc, X_test_enc = encode_and_impute(X_fe, test_fe)

    # Re-run CV with same classifier (keeps it comparable)
    fe_scores = cv_score(clf, X_fe_enc, y)
    print(f"\nAfter FE CV acc (mean): {fe_scores.mean():.4f}  std: {fe_scores.std():.4f}")
    print("After FE fold scores:", np.round(fe_scores, 4))
    print(f"\nDelta (FE - baseline): {fe_scores.mean() - base_scores.mean():.4f}")

    # Train on full FE data to inspect feature importances quickly
    print("\n3) Training RF on full FE data to show top features (quick check)...")
    clf_full = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
    clf_full.fit(X_fe_enc, y)
    importances = pd.Series(clf_full.feature_importances_, index=X_fe_enc.columns).sort_values(ascending=False)
    print("Top 10 features by importance:\n", importances.head(10))

    # Save FE'd CSVs (optional) and a short result log
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    train_fe.to_csv(OUT_DIR / f"train_processed_fe_{timestamp}.csv", index=False)
    test_fe.to_csv(OUT_DIR / f"test_processed_fe_{timestamp}.csv", index=False)

    summary = {
        "time": timestamp,
        "baseline_cv_mean": float(base_scores.mean()),
        "fe_cv_mean": float(fe_scores.mean()),
        "delta": float((fe_scores.mean() - base_scores.mean()))
    }
    pd.Series(summary).to_frame("value").to_csv(OUT_DIR / f"day16_summary_{timestamp}.csv")

    t1 = datetime.datetime.now()
    print(f"\nDone. Elapsed: {t1 - t0}")
    print(f"Saved FE files + summary in: {OUT_DIR.resolve()}")

if __name__ == "__main__":
    main()


Day16 run started: 2025-09-23T14:01:17.124712

1) Baseline evaluation on existing processed features...
Baseline CV acc (mean): 0.8294  std: 0.0196
Baseline fold scores: [0.8603 0.8315 0.8146 0.8034 0.8371]

2) Adding features (if missing) to train/test...

After FE CV acc (mean): 0.8305  std: 0.0151
After FE fold scores: [0.8603 0.8258 0.8258 0.8202 0.8202]

Delta (FE - baseline): 0.0011

3) Training RF on full FE data to show top features (quick check)...
Top 10 features by importance:
 PassengerId        0.119967
Title_Mr           0.113627
Sex_male           0.107291
Age                0.092962
FarePerPerson      0.084676
Fare               0.082743
Fare_log           0.081013
Pclass             0.038915
TicketGroupSize    0.037655
FamilySize         0.030356
dtype: float64

Done. Elapsed: 0:00:09.318970
Saved FE files + summary in: /content/outputs
