In [17]:
# ===============================================================
# 03_model_tuning.ipynb
# Purpose: Hyperparameter tuning and cross-validation
# ===============================================================
print("Notebook ready ✅")

Notebook ready ✅


In [18]:
# safe to re-run
%pip install -q ipywidgets tqdm tqdm-joblib

Note: you may need to restart the kernel to use updated packages.


In [19]:
# ================================================================
# 03_model_tuning.ipynb
# Purpose: Hyperparameter tuning (CV) for Savings regression
# ================================================================

# Core imports
import numpy as np
import pandas as pd
import time
import warnings
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

# Sklearn
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor

# Distributions
from scipy.stats import loguniform, randint

# Reproducibility
SEED = 42
np.random.seed(SEED)

# Cross-validation setup
CV_FOLDS = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Explicit RMSE scorer (safe across sklearn versions)
rmse_scorer = make_scorer(
    lambda yt, yp: -float(np.sqrt(mean_squared_error(yt, yp))),
    greater_is_better=True
)

def eval_regression(est, X_test, y_test):
    """Compute evaluation metrics for a fitted regression estimator."""
    y_pred = est.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = float(np.mean((y_test - y_pred) ** 2))
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_test, y_pred)
    return mae, mse, rmse, r2

In [20]:
# ================================================================
# Load processed dataset
# ================================================================
DATA_PATH = "../data/processed/transactions_long.csv"
df = pd.read_csv(DATA_PATH)

target_col = "Desired_Savings"
drop_cols = ["entity_id", "category", "Occupation", "City_Tier", target_col]
num_cols = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore").select_dtypes(include=[np.number]).columns.tolist()

X_savings = df[num_cols].copy()
y_savings = df[target_col].astype(float).copy()

# Train/test split
X_savings_train, X_savings_test, y_savings_train, y_savings_test = train_test_split(
    X_savings, y_savings, test_size=0.2, random_state=SEED
)

print("Savings shapes:", X_savings_train.shape, X_savings_test.shape)

Savings shapes: (124751, 17) (31188, 17)


In [21]:
# === Tuning setup (run once, above 5A) ===
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import loguniform, randint

# Repro + CV config
SEED      = 42
CV_FOLDS  = 5            # k-folds
N_ITER_REG = 24          # param samples per model (adjust to speed up/slow down)
N_JOBS     = -1          # use all cores

# Pipelines / estimators
svr_pipe = make_pipeline(
    # scale for SVR
    StandardScaler(),
    LinearSVR(dual=True,  # default True; good when n_samples > n_features
              tol=1e-3,
              max_iter=5000,
              random_state=SEED)
)

rf_reg = RandomForestRegressor(
    random_state=SEED,
    n_jobs=N_JOBS
)

# Parameter distributions for RandomizedSearchCV
# (SVR): search C and epsilon on log scales; keep loss="epsilon_insensitive" default
svr_dist = {
    "linearsvr__C":       loguniform(1e-2, 1e2),
    "linearsvr__epsilon": loguniform(1e-3, 1.0),
}

# (RandomForest): moderate ranges to keep runtime reasonable
rf_dist = {
    "n_estimators":   randint(50, 200),
    "max_depth":      randint(6, 18),
    "min_samples_split": randint(2, 8),
    "min_samples_leaf":  randint(1, 6),
    "max_features":      ["sqrt", "log2"],
    "bootstrap":         [True],
}

In [22]:
# === 5A. Run grid/random searches (REGRESSION: Savings) ======================
# Fast knobs + progress bar CV search for SVR and RandomForest

import time
import numpy as np
import pandas as pd

from copy import deepcopy
from tqdm.auto import tqdm

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    KFold, cross_val_score, ParameterSampler
)

# ---------- tuning knobs (fast) ----------
SEED       = 42
N_JOBS     = -1
CV_FOLDS   = 3         # 3-fold during tuning; keep 5-fold for final checks
N_ITER_REG = 12        # #param samples per model (fast but useful)

rng = np.random.RandomState(SEED)
cv  = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)
scoring = 'neg_root_mean_squared_error'  # RMSE (negative)

# ---------- optional: subsample features for tuning only ----------
def maybe_subsample(X, y, frac=0.35, random_state=SEED):
    n = int(len(y) * frac)
    idx = rng.choice(len(y), n, replace=False)
    return X.iloc[idx] if hasattr(X, "iloc") else X[idx], y.iloc[idx] if hasattr(y, "iloc") else y[idx]

X_savings_train_t, y_savings_train_t = maybe_subsample(X_savings_train, y_savings_train, frac=0.35, random_state=SEED)

# ---------- estimators + distributions (narrower = faster & stabler) ----------
svr_pipe = make_pipeline(
    StandardScaler(),
    LinearSVR(dual=True, tol=1e-3, max_iter=5000, random_state=SEED)
)
svr_dist = {
    "linearsvr__C":       np.exp(rng.uniform(np.log(1e-2), np.log(1e2), size=N_ITER_REG*2)),  # pre-sampled log-uniform
    "linearsvr__epsilon": np.exp(rng.uniform(np.log(5e-2), np.log(5e-1), size=N_ITER_REG*2)),
}

rf_reg = RandomForestRegressor(
    n_jobs=N_JOBS,
    random_state=SEED
)
rf_dist = {
    "n_estimators":      rng.randint(60, 141, size=N_ITER_REG*2),  # 60–140
    "max_depth":         rng.randint(6, 15,  size=N_ITER_REG*2),   # 6–14
    "min_samples_split": rng.randint(2, 7,   size=N_ITER_REG*2),   # 2–6
    "min_samples_leaf":  rng.randint(1, 5,   size=N_ITER_REG*2),   # 1–4
    "max_features":      ["sqrt"],                                  # drop "log2" to prune space
    "bootstrap":         [True],
}

def param_sampler(dist, n_iter, rng):
    """
    Turn our arrays/lists into a ParameterSampler-friendly dict.
    If a value is an array/list, sample uniformly; if a scalar, use as-is.
    """
    space = {}
    for k, v in dist.items():
        if isinstance(v, (list, np.ndarray)):
            space[k] = v
        else:
            space[k] = [v]
    return ParameterSampler(space, n_iter=n_iter, random_state=rng)

def manual_cv_search(label, base_est, dist, X, y, n_iter=N_ITER_REG):
    """
    Simple manual CV search with a tqdm progress bar.
    Updates progress by CV_FOLDS per parameter set (so total = n_iter*CV_FOLDS).
    Returns: best_estimator, best_cv_rmse, best_params, fit_time
    """
    sampler = list(param_sampler(dist, n_iter, rng))
    pbar = tqdm(total=len(sampler)*CV_FOLDS, desc=f"{label} tuning", unit="fits")
    best_rmse = np.inf
    best_params = None

    for params in sampler:
        est = clone(base_est).set_params(**params)
        # run k-fold CV (note: we update by CV_FOLDS for this set)
        scores = cross_val_score(est, X, y, scoring=scoring, cv=cv, n_jobs=N_JOBS)
        rmse = -np.mean(scores)
        pbar.update(CV_FOLDS)

        if rmse < best_rmse:
            best_rmse   = rmse
            best_params = deepcopy(params)

    pbar.close()

    # Fit best on *full* (subsampled) training
    best = clone(base_est).set_params(**best_params)
    t0 = time.perf_counter()
    best.fit(X, y)
    fit_time = time.perf_counter() - t0

    print(f"Best CV (neg RMSE): {-best_rmse:.4f} | Params: {best_params}")
    print(f"Fit time: {fit_time:.2f}s")
    return best, -best_rmse, best_params, fit_time  # return positive score for readability

# ---------- run searches ----------
print("\n► Running CV for regression: SVR (Savings)")
best_svr, best_svr_score, best_svr_params, _ = manual_cv_search(
    "SVR (Savings)", svr_pipe, svr_dist, X_savings_train_t, y_savings_train_t, n_iter=N_ITER_REG
)

print("\n► Running CV for regression: RandomForest (Savings)")
best_rf, best_rf_score, best_rf_params, _ = manual_cv_search(
    "RandomForest (Savings)", rf_reg, rf_dist, X_savings_train_t, y_savings_train_t, n_iter=N_ITER_REG*5  # a bit more budget
)

# ---------- evaluate on the held-out test set (full, not subsampled) ----------
def evaluate_best(name, est, X_test, y_test):
    from math import sqrt
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    y_pred = est.predict(X_test)
    mae  = mean_absolute_error(y_test, y_pred)
    mse  = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2   = r2_score(y_test, y_pred)
    return {"Model": name, "MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

rows = []
rows.append(evaluate_best("SVR (best)",         best_svr, X_savings_test, y_savings_test))
rows.append(evaluate_best("Random Forest (best)", best_rf, X_savings_test, y_savings_test))

reg_results_df = pd.DataFrame(rows).sort_values("RMSE")
print("\n=== Savings → Regression (test set) ===")
display(reg_results_df)

# keep for later summary cell if needed
best_svr_savings     = best_svr
best_svr_savings_cv  = best_svr_score
best_rf_savings      = best_rf
best_rf_savings_cv   = best_rf_score


► Running CV for regression: SVR (Savings)


SVR (Savings) tuning:   0%|          | 0/36 [00:00<?, ?fits/s]

Best CV (neg RMSE): -2868.0748 | Params: {'linearsvr__epsilon': np.float64(0.3897685379286406), 'linearsvr__C': np.float64(36.06389385521764)}
Fit time: 0.06s

► Running CV for regression: RandomForest (Savings)


RandomForest (Savings) tuning:   0%|          | 0/180 [00:00<?, ?fits/s]

Best CV (neg RMSE): -666.4141 | Params: {'n_estimators': np.int64(110), 'min_samples_split': np.int64(2), 'min_samples_leaf': np.int64(1), 'max_features': 'sqrt', 'max_depth': np.int64(14), 'bootstrap': True}
Fit time: 8.92s

=== Savings → Regression (test set) ===


Unnamed: 0,Model,MAE,MSE,RMSE,R2
1,Random Forest (best),146.344882,293148.1,541.431512,0.995489
0,SVR (best),1043.81158,10040050.0,3168.60336,0.845506
