<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-24/day24_feature_selection_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# day24_feature_selection_tuning.py
"""
Day 24: Feature selection, hyperparameter tuning, explainability.
Input: train_processed.csv (must have 'Survived' target) and test_processed.csv (has PassengerId or Id).
Output: day24_submission.csv, day24_model.joblib, day24_report.txt
"""

import pandas as pd
import numpy as np
import json
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.inspection import permutation_importance
from scipy.stats import randint, uniform

# ---------------------------
# Step 0: Files / Constants
# ---------------------------
# Seed shared by the split, the CV folds, the search and the estimators.
RANDOM_STATE = 42

# Label column name (from the Day 23 workflow).
TARGET_COL = "Survived"

# Input tables.
TRAIN_FILE = "train_processed.csv"
TEST_FILE = "test_processed.csv"

# Output artifacts written at the end of the script.
MODEL_FILE = "day24_model.joblib"
SUBMISSION_FILE = "day24_submission.csv"
REPORT_FILE = "day24_report.txt"

# ---------------------------
# Step 1: Load data
# ---------------------------
# Read the preprocessed train and test tables.
train, test = pd.read_csv(TRAIN_FILE), pd.read_csv(TEST_FILE)

# The first candidate name present in the test table is used as its id column;
# None when no candidate matches.
id_col = next(
    (name for name in ("PassengerId", "Id", "ID", "passengerid") if name in test.columns),
    None,
)

# Fail early when the training table has no label column.
if TARGET_COL not in train.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in {TRAIN_FILE}.")

# Features are everything except the target, and except the id column when the
# training table carries it too.
drop_cols = [TARGET_COL] + ([id_col] if id_col and id_col in train.columns else [])
y = train[TARGET_COL].copy()
X = train.drop(columns=drop_cols, errors='ignore')

# Keep the test ids for the submission file. Without an id column, a 0..n-1
# sequence named "Id" stands in for them.
if not id_col:
    test_ids = pd.Series(np.arange(len(test)), name="Id")
    X_test_full = test.copy()
else:
    test_ids = test[id_col].copy()
    X_test_full = test.drop(columns=[id_col], errors='ignore')

# ---------------------------
# Step 2: detect numeric & categorical
# ---------------------------
# Bucket the training columns by dtype: bool, text-like (object/category),
# and int64/float64.
boolean_features = list(X.select_dtypes(include=["bool"]).columns)
categorical_features = list(X.select_dtypes(include=["object", "category"]).columns)
numeric_features = list(X.select_dtypes(include=["int64", "float64"]).columns)

# Cast each boolean column to int in the train features and then in the test
# features, and treat those columns as numeric from here on.
for bool_col in boolean_features:
    for frame in (X, X_test_full):
        frame[bool_col] = frame[bool_col].astype(int)
numeric_features += boolean_features


print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# ---------------------------
# Step 3: preprocessing pipelines
# ---------------------------
# Numeric columns: median imputation followed by standardization.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: most-frequent imputation followed by one-hot encoding
# (handle_unknown="ignore", so categories unseen at fit time do not raise).
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its pipeline; columns in neither list are dropped.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")

# ---------------------------
# Step 4: train/validation split
# ---------------------------
# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class balance.
split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train, X_val, y_train, y_val = split_parts

# ---------------------------
# Step 5: pipeline + param distributions for RandomizedSearchCV
# ---------------------------
# Pipeline skeleton. The "classifier" step is a placeholder: every search space
# below supplies its own estimator under the "classifier" key.
base_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=RANDOM_STATE)),
])

# Random forest search space.
rf_space = {
    "classifier": [RandomForestClassifier(random_state=RANDOM_STATE)],
    "classifier__n_estimators": randint(100, 501),   # 100..500
    "classifier__max_depth": [None, 5, 8, 12, 20],
    "classifier__min_samples_split": randint(2, 11),
    "classifier__min_samples_leaf": randint(1, 5),
    "classifier__max_features": ["sqrt", "log2", None],
}

# Gradient boosting search space.
gb_space = {
    "classifier": [GradientBoostingClassifier(random_state=RANDOM_STATE)],
    "classifier__n_estimators": randint(100, 501),
    "classifier__learning_rate": uniform(0.01, 0.19),  # 0.01-0.2
    "classifier__max_depth": randint(3, 9),
    "classifier__subsample": [0.6, 0.8, 1.0],
}

# Logistic regression search space (L2 penalty, lbfgs solver).
logreg_space = {
    "classifier": [LogisticRegression(random_state=RANDOM_STATE, max_iter=2000)],
    "classifier__C": uniform(0.01, 10),
    "classifier__penalty": ["l2"],
    "classifier__solver": ["lbfgs"],
}

# A list of dicts lets one search sample across several estimator families.
param_distributions = [rf_space, gb_space, logreg_space]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

search_settings = dict(
    n_iter=30,                    # change if you want more/less exploration
    scoring="roc_auc",
    n_jobs=-1,
    cv=cv,
    verbose=2,
    random_state=RANDOM_STATE,
    return_train_score=False,
)
search = RandomizedSearchCV(
    estimator=base_pipe,
    param_distributions=param_distributions,
    **search_settings,
)

print("Starting RandomizedSearchCV... (this may take a while depending on n_iter)")
search.fit(X_train, y_train)

# Winning pipeline, its sampled parameters, and its mean CV ROC AUC.
best_model, best_params, best_score = (
    search.best_estimator_,
    search.best_params_,
    search.best_score_,
)

# ---------------------------
# Step 6: Evaluate on validation set
# ---------------------------
# Hard class predictions on the held-out split.
y_val_pred = best_model.predict(X_val)

# Second-column probabilities when the model exposes predict_proba; None when
# it does not, or when the call itself raises.
y_val_proba = None
if hasattr(best_model, "predict_proba"):
    try:
        y_val_proba = best_model.predict_proba(X_val)[:, 1]
    except Exception:
        y_val_proba = None

acc = accuracy_score(y_val, y_val_pred)
# ROC AUC needs scores, so it is skipped when no probabilities are available.
if y_val_proba is None:
    roc_auc = None
else:
    roc_auc = roc_auc_score(y_val, y_val_proba)
conf_mat = confusion_matrix(y_val, y_val_pred)
clf_report = classification_report(y_val, y_val_pred)

print("\nValidation results:")
print(f"Accuracy: {acc:.4f}")
if roc_auc is not None:
    print(f"ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:\n", clf_report)
print("\nConfusion Matrix:\n", conf_mat)

# ---------------------------
# Step 7: Permutation importance (explainability)
# ---------------------------
print("\nComputing permutation importances...")
perm = permutation_importance(
    best_model,
    X_val,
    y_val,
    n_repeats=12,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# permutation_importance shuffles the columns of the X it is handed. The
# estimator here is the whole pipeline (preprocessing included), so the result
# holds one importance per *raw* X_val column, in X_val's column order. It is
# not one per post-preprocessing / one-hot output column. Labelling the rows
# with X_val.columns therefore always lines up, whereas names rebuilt from the
# numeric/categorical lists can differ in order or length from the frame.
feature_names = list(X_val.columns)

# One row per input column, most important first.
feat_imp_df = (
    pd.DataFrame({
        "feature": feature_names,
        "importance_mean": perm.importances_mean,
        "importance_std": perm.importances_std
    })
    .sort_values("importance_mean", ascending=False)
    .reset_index(drop=True)
)

print("\nTop features by permutation importance:")
print(feat_imp_df.head(15).to_string(index=False))

# ---------------------------
# Step 8: Save model, submission, report
# ---------------------------
# Persist the fitted pipeline (preprocessing + classifier) as one artifact.
joblib.dump(best_model, MODEL_FILE)
print(f"\nSaved best model to: {MODEL_FILE}")

# Create submission
print("Generating submission on test set...")
test_preds = best_model.predict(X_test_full)
# When the target was read as floats, the predicted labels come back as
# 0.0/1.0. Write whole-number float labels as integers so the submission holds
# plain 0/1 classes. The mod check leaves non-integral or non-finite values
# untouched.
if np.issubdtype(test_preds.dtype, np.floating) and np.all(np.mod(test_preds, 1) == 0):
    test_preds = test_preds.astype(int)
submission = pd.DataFrame({id_col or "Id": test_ids, TARGET_COL: test_preds})
submission.to_csv(SUBMISSION_FILE, index=False)
print(f"Saved submission to: {SUBMISSION_FILE}")

# Save a short report
report = {
    # Parameter values may be estimator objects, so they are stored as strings.
    "best_params": {k: str(v) for k, v in best_params.items()},
    "best_cv_score_roc_auc": float(best_score) if hasattr(best_score, "__float__") else str(best_score),
    "val_accuracy": float(acc),
    "val_roc_auc": float(roc_auc) if roc_auc is not None else None,
    "classification_report": clf_report,
    "top_features": feat_imp_df.head(20).to_dict(orient="records")
}

# Explicit encoding so the report does not depend on the platform default.
with open(REPORT_FILE, "w", encoding="utf-8") as f:
    f.write(json.dumps(report, indent=2))
print(f"Saved report to: {REPORT_FILE}")

print("\nDone. Files generated:")
print("-", MODEL_FILE)
print("-", SUBMISSION_FILE)
print("-", REPORT_FILE)

Numeric features: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone', 'Fare_log', 'TicketGroupSize', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_U', 'AgeBin_Child', 'AgeBin_MidAge', 'AgeBin_Senior', 'AgeBin_Teen']
Categorical features: []
Starting RandomizedSearchCV... (this may take a while depending on n_iter)
Fitting 5 folds for each of 30 candidates, totalling 150 fits

Validation results:
Accuracy: 0.8156
ROC AUC: 0.8338

Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.88      0.85       110
         1.0       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179


Confusion Matrix:
 [[97 13]
 [20 49]]

Computing permutation importances...