<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-10/day10_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# day10_feature_engineering.py
# Run from project root: python day10/day10_feature_engineering.py
# Outputs: day10/day10_titanic_feat.csv, day10/assets/preprocessor.joblib, day10/assets/te_maps.json

import json
from pathlib import Path
import numpy as np
import pandas as pd

# sklearn
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

OUT_DIR = Path("day10"); OUT_DIR.mkdir(exist_ok=True)
ASSETS = OUT_DIR / "assets"; ASSETS.mkdir(exist_ok=True)

# -------------------------
# Helpers: safe load
# -------------------------
candidates = [
    Path("day05/day05_titanic_feat.csv"),
    Path("day05_titanic_feat.csv"),
    Path("day02/day02_titanic_preserved.csv"),
    Path("day02/day02_titanic_clean.csv"),
    Path("train.csv")
]
data_path = None
for p in candidates:
    if p.exists():
        data_path = p
        break
if data_path is None:
    raise FileNotFoundError("No input CSV found. Place day05/day05_titanic_feat.csv or day02/day02_titanic_clean.csv or train.csv in project root.")

print("Loading:", data_path)
df_raw = pd.read_csv(data_path)

# -------------------------
# 1) Basic feature engineering (create features in-place)
# -------------------------
df = df_raw.copy()

# FamilySize and IsAlone
if 'SibSp' in df.columns and 'Parch' in df.columns:
    df['FamilySize'] = df['SibSp'].fillna(0).astype(int) + df['Parch'].fillna(0).astype(int) + 1
else:
    df['FamilySize'] = df.get('FamilySize', 1)

df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Title from Name (if exists)
if 'Name' in df.columns:
    df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.').str.strip()
    df['Title'] = df['Title'].replace({
        "Mlle":"Miss","Ms":"Miss","Mme":"Mrs",
        "Countess":"Rare","Lady":"Rare","Sir":"Rare","Don":"Rare","Dona":"Rare",
        "Col":"Rare","Major":"Rare","Capt":"Rare","Rev":"Rare","Dr":"Rare","Jonkheer":"Rare"
    })
    # Collapse very rare titles to 'Rare' (train later handles exact mapping)
    vc = df['Title'].value_counts()
    rare_titles = vc[vc < 10].index.tolist()
    df.loc[df['Title'].isin(rare_titles), 'Title'] = 'Rare'
else:
    df['Title'] = 'Unknown'

# Deck from Cabin (first letter), missing->'U'
if 'Cabin' in df.columns:
    df['Cabin'] = df['Cabin'].fillna('U')
    df['Deck'] = df['Cabin'].astype(str).str[0]
else:
    df['Deck'] = 'U'

# Fare per person
if 'Fare' in df.columns:
    df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    df['Fare_log'] = np.log1p(df['Fare'])
else:
    df['FarePerPerson'] = 0
    df['Fare_log'] = 0

# Age imputation by Title median (we will compute mapping from train only below)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Ticket group size (group by Ticket) - useful signal
if 'Ticket' in df.columns:
    ticket_counts = df['Ticket'].fillna('NA').map(df['Ticket'].value_counts())
    df['TicketGroupSize'] = ticket_counts
else:
    df['TicketGroupSize'] = 1

# Age bins
df['AgeBin'] = pd.cut(df['Age'].fillna(df['Age'].median()), bins=[0,12,20,40,60,120],
                      labels=['Child','Teen','Adult','MidAge','Senior'])

# -------------------------
# 2) Train/test split (so we can fit encoders on train only)
# -------------------------
if 'Survived' in df.columns:
    X_full = df.drop(columns=[])
    y_full = df['Survived']
    X_train, X_hold, y_train, y_hold = train_test_split(X_full, y_full, test_size=0.15, stratify=y_full, random_state=42)
    train_idx = X_train.index
    print("Train rows:", X_train.shape[0], "Holdout rows:", X_hold.shape[0])
else:
    # no label (rare) — we will treat all as train (useful if user already merged)
    X_train = df.copy()
    y_train = None
    X_hold = None

# -------------------------
# 3) Age imputation by Title (compute mapping on TRAIN only)
# -------------------------
if 'Age' in X_train.columns:
    age_map = X_train.groupby('Title')['Age'].median().to_dict()
    # Fill train ages using title median where missing
    X_train['Age'] = X_train.apply(lambda r: age_map[r['Title']] if pd.isna(r['Age']) and r['Title'] in age_map else r['Age'], axis=1)
    # For holdout/test and full df, apply mapping with fallback to global median
    global_age_med = X_train['Age'].median()
    df['Age'] = df.apply(lambda r: age_map.get(r['Title'], global_age_med) if pd.isna(r['Age']) else r['Age'], axis=1)
else:
    age_map = {}
    global_age_med = None

# Save age map for reproducibility
with open(ASSETS / "age_map.json", "w") as f:
    json.dump(age_map, f)

# -------------------------
# 4) K-Fold Target Encoding (safe) for 'Title' (example)
# -------------------------
# Only run if we have labels in train
def kfold_target_encode(train_series, target_series, test_series=None, n_splits=5, seed=42):
    """
    Returns (train_encoded_series, test_encoded_series, mapping)
    train_encoded_series: out-of-fold target-mean encoding for train
    test_encoded_series: mapping applied to test (if provided) filled with global mean
    mapping: full-train mapping of category -> mean
    """
    tr = train_series.reset_index(drop=True)
    y = target_series.reset_index(drop=True)
    oof = pd.Series(index=tr.index, dtype=float)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, val_idx in skf.split(np.zeros(len(tr)), y):
        # compute mapping on train fold
        fold_means = pd.DataFrame({'cat': tr.iloc[train_idx], 'y': y.iloc[train_idx]}).groupby('cat')['y'].mean()
        # map to validation fold
        oof.iloc[val_idx] = tr.iloc[val_idx].map(fold_means)
    global_mean = y.mean()
    oof.fillna(global_mean, inplace=True)
    mapping_full = pd.DataFrame({'cat': tr, 'y': y}).groupby('cat')['y'].mean().to_dict()
    if test_series is not None:
        test_enc = test_series.map(mapping_full).fillna(global_mean)
    else:
        test_enc = None
    return oof, test_enc, mapping_full

# Apply to Title (if labels exist)
te_maps = {}
if 'Survived' in df.columns:
    tr_series = X_train['Title'].astype(str)
    y_tr = y_train.reset_index(drop=True)
    # test_series: apply mapping to entire df Title col (for final processed CSV)
    test_series = df['Title'].astype(str)
    train_te, test_te, mapping_full = kfold_target_encode(tr_series, y_tr, test_series=test_series, n_splits=5)
    # store encoded values back
    # For train rows -> set encoded values
    df.loc[X_train.index, 'Title_te'] = train_te.values
    # For rest -> set test encoded (mapping from full train)
    df.loc[~df.index.isin(X_train.index), 'Title_te'] = test_te.loc[~df.index.isin(X_train.index)].values
    te_maps['Title'] = mapping_full
else:
    # no labels -> naive mean encoding from whole df (not ideal but unavoidable)
    te_maps['Title'] = df.groupby('Title')['Survived'].mean().to_dict() if 'Survived' in df.columns else {}

# Save target encoding maps
with open(ASSETS / "te_maps.json", "w") as f:
    json.dump({k: {str(cat): float(v) for cat, v in m.items()} for k, m in te_maps.items()}, f, indent=2)

# -------------------------
# 5) Build ColumnTransformer pipeline (numerics + categorical OHE)
# -------------------------
# Choose columns
num_cols = [c for c in ['Age','Fare','FarePerPerson','TicketGroupSize','FamilySize','Title_te'] if c in df.columns]
cat_cols = [c for c in ['Pclass','Sex','Embarked','Deck','AgeBin'] if c in df.columns]

from sklearn.pipeline import make_pipeline

num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
], remainder='drop', verbose_feature_names_out=False)

# Fit preprocessor on TRAIN (to avoid leakage)
# If we have labels, fit on X_train rows; otherwise fit on full df
if 'Survived' in df.columns:
    preprocessor.fit(df.loc[X_train.index, num_cols + cat_cols])
else:
    preprocessor.fit(df[num_cols + cat_cols])

# Transform entire df to get final numeric matrix
X_proc = preprocessor.transform(df[num_cols + cat_cols])
feature_names_num = num_cols
try:
    ohe_cols = preprocessor.named_transformers_['cat'].named_steps['ohe'].get_feature_names_out(cat_cols)
    feature_names = list(feature_names_num) + list(ohe_cols)
except Exception:
    feature_names = feature_names_num

X_proc_df = pd.DataFrame(X_proc, columns=feature_names, index=df.index)

# Add target back if present
if 'Survived' in df.columns:
    out_df = pd.concat([X_proc_df, df['Survived'].reset_index(drop=True)], axis=1)
else:
    out_df = X_proc_df.copy()

# -------------------------
# 6) Save processed CSV & preprocessor
# -------------------------
out_path = OUT_DIR / "day10_titanic_feat.csv"
out_df.to_csv(out_path, index=False)
joblib.dump(preprocessor, ASSETS / "preprocessor.joblib")
print("Saved processed features to:", out_path)
print("Saved preprocessor to:", ASSETS / "preprocessor.joblib")

Loading: day02/day02_titanic_clean.csv
Train rows: 757 Holdout rows: 134
Saved processed features to: day10/day10_titanic_feat.csv
Saved preprocessor to: day10/assets/preprocessor.joblib


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)


In [None]:
from google.colab import drive
drive.mount('/content/drive')