In [1]:
# ===============================================================
# 03_model_tuning.ipynb
# Purpose: Hyperparameter tuning and cross-validation
# ===============================================================
print("Notebook ready ✅")

Notebook ready ✅


In [2]:
# safe to re-run
%pip install -q ipywidgets tqdm tqdm-joblib

Note: you may need to restart the kernel to use updated packages.


In [3]:
# ================================================================
# 03_model_tuning.ipynb
# Purpose: Hyperparameter tuning (CV) for Savings regression
# ================================================================

# Core imports
import numpy as np
import pandas as pd
import time
import warnings
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

# Sklearn
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    make_scorer, mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor

# Distributions
from scipy.stats import loguniform, randint

# Reproducibility
SEED = 42
np.random.seed(SEED)

# Cross-validation setup
CV_FOLDS = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Explicit RMSE scorer (safe across sklearn versions)
rmse_scorer = make_scorer(
    lambda yt, yp: -float(np.sqrt(mean_squared_error(yt, yp))),
    greater_is_better=True
)

def eval_regression(est, X_test, y_test):
    """Compute evaluation metrics for a fitted regression estimator."""
    y_pred = est.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = float(np.mean((y_test - y_pred) ** 2))
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_test, y_pred)
    return mae, mse, rmse, r2

In [4]:
# ================================================================
# Load processed dataset
# ================================================================
DATA_PATH = "../data/processed/transactions_long.csv"
df = pd.read_csv(DATA_PATH)

target_col = "Desired_Savings"
drop_cols = ["entity_id", "category", "Occupation", "City_Tier", target_col]
num_cols = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore").select_dtypes(include=[np.number]).columns.tolist()

X_savings = df[num_cols].copy()
y_savings = df[target_col].astype(float).copy()

# Train/test split
X_savings_train, X_savings_test, y_savings_train, y_savings_test = train_test_split(
    X_savings, y_savings, test_size=0.2, random_state=SEED
)

print("Savings shapes:", X_savings_train.shape, X_savings_test.shape)

Savings shapes: (124751, 17) (31188, 17)


In [5]:
# === Tuning setup (run once, above 5A) ===
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import loguniform, randint

# Repro + CV config
SEED      = 42
CV_FOLDS  = 5            # k-folds
N_ITER_REG = 24          # param samples per model (adjust to speed up/slow down)
N_JOBS     = -1          # use all cores

# Pipelines / estimators
svr_pipe = make_pipeline(
    # scale for SVR
    StandardScaler(),
    LinearSVR(dual=True,  # default True; good when n_samples > n_features
              tol=1e-3,
              max_iter=5000,
              random_state=SEED)
)

rf_reg = RandomForestRegressor(
    random_state=SEED,
    n_jobs=N_JOBS
)

# Parameter distributions for RandomizedSearchCV
# (SVR): search C and epsilon on log scales; keep loss="epsilon_insensitive" default
svr_dist = {
    "linearsvr__C":       loguniform(1e-2, 1e2),
    "linearsvr__epsilon": loguniform(1e-3, 1.0),
}

# (RandomForest): moderate ranges to keep runtime reasonable
rf_dist = {
    "n_estimators":   randint(50, 200),
    "max_depth":      randint(6, 18),
    "min_samples_split": randint(2, 8),
    "min_samples_leaf":  randint(1, 6),
    "max_features":      ["sqrt", "log2"],
    "bootstrap":         [True],
}

In [6]:
# === 5A. Run grid/random searches (REGRESSION: Savings) ======================
# Fast knobs + progress bar CV search for SVR and RandomForest

import time
import numpy as np
import pandas as pd

from copy import deepcopy
from tqdm.auto import tqdm

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    KFold, cross_val_score, ParameterSampler
)

# ---------- tuning knobs (fast) ----------
SEED       = 42
N_JOBS     = -1
CV_FOLDS   = 3         # 3-fold during tuning; keep 5-fold for final checks
N_ITER_REG = 12        # #param samples per model (fast but useful)

rng = np.random.RandomState(SEED)
cv  = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)
scoring = 'neg_root_mean_squared_error'  # RMSE (negative)

# ---------- subsample features for tuning only ----------
def maybe_subsample(X, y, frac=0.35, random_state=SEED):
    n = int(len(y) * frac)
    idx = rng.choice(len(y), n, replace=False)
    return X.iloc[idx] if hasattr(X, "iloc") else X[idx], y.iloc[idx] if hasattr(y, "iloc") else y[idx]

X_savings_train_t, y_savings_train_t = maybe_subsample(X_savings_train, y_savings_train, frac=0.35, random_state=SEED)

# ---------- estimators + distributions (narrower = faster & stabler) ----------
svr_pipe = make_pipeline(
    StandardScaler(),
    LinearSVR(dual=True, tol=1e-3, max_iter=5000, random_state=SEED)
)
svr_dist = {
    "linearsvr__C":       np.exp(rng.uniform(np.log(1e-2), np.log(1e2), size=N_ITER_REG*2)),  # pre-sampled log-uniform
    "linearsvr__epsilon": np.exp(rng.uniform(np.log(5e-2), np.log(5e-1), size=N_ITER_REG*2)),
}

rf_reg = RandomForestRegressor(
    n_jobs=N_JOBS,
    random_state=SEED
)
rf_dist = {
    "n_estimators":      rng.randint(60, 141, size=N_ITER_REG*2),  # 60–140
    "max_depth":         rng.randint(6, 15,  size=N_ITER_REG*2),   # 6–14
    "min_samples_split": rng.randint(2, 7,   size=N_ITER_REG*2),   # 2–6
    "min_samples_leaf":  rng.randint(1, 5,   size=N_ITER_REG*2),   # 1–4
    "max_features":      ["sqrt"],                                  # drop "log2" to prune space
    "bootstrap":         [True],
}

def param_sampler(dist, n_iter, rng):
    """
    Turn our arrays/lists into a ParameterSampler-friendly dict.
    If a value is an array/list, sample uniformly; if a scalar, use as-is.
    """
    space = {}
    for k, v in dist.items():
        if isinstance(v, (list, np.ndarray)):
            space[k] = v
        else:
            space[k] = [v]
    return ParameterSampler(space, n_iter=n_iter, random_state=rng)

def manual_cv_search(label, base_est, dist, X, y, n_iter=N_ITER_REG):
    """
    Simple manual CV search with a tqdm progress bar.
    Updates progress by CV_FOLDS per parameter set (so total = n_iter*CV_FOLDS).
    Returns: best_estimator, best_cv_rmse, best_params, fit_time
    """
    sampler = list(param_sampler(dist, n_iter, rng))
    pbar = tqdm(total=len(sampler)*CV_FOLDS, desc=f"{label} tuning", unit="fits")
    best_rmse = np.inf
    best_params = None

    for params in sampler:
        est = clone(base_est).set_params(**params)
        # run k-fold CV (note: we update by CV_FOLDS for this set)
        scores = cross_val_score(est, X, y, scoring=scoring, cv=cv, n_jobs=N_JOBS)
        rmse = -np.mean(scores)
        pbar.update(CV_FOLDS)

        if rmse < best_rmse:
            best_rmse   = rmse
            best_params = deepcopy(params)

    pbar.close()

    # Fit best on *full* (subsampled) training
    best = clone(base_est).set_params(**best_params)
    t0 = time.perf_counter()
    best.fit(X, y)
    fit_time = time.perf_counter() - t0

    print(f"Best CV (neg RMSE): {-best_rmse:.4f} | Params: {best_params}")
    print(f"Fit time: {fit_time:.2f}s")
    return best, -best_rmse, best_params, fit_time  # return positive score for readability

# ---------- run searches ----------
print("\n► Running CV for regression: SVR (Savings)")
best_svr, best_svr_score, best_svr_params, _ = manual_cv_search(
    "SVR (Savings)", svr_pipe, svr_dist, X_savings_train_t, y_savings_train_t, n_iter=N_ITER_REG
)

print("\n► Running CV for regression: RandomForest (Savings)")
best_rf, best_rf_score, best_rf_params, _ = manual_cv_search(
    "RandomForest (Savings)", rf_reg, rf_dist, X_savings_train_t, y_savings_train_t, n_iter=N_ITER_REG*5  # a bit more budget
)

# ---------- evaluate on the held-out test set (full, not subsampled) ----------
def evaluate_best(name, est, X_test, y_test):
    from math import sqrt
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    y_pred = est.predict(X_test)
    mae  = mean_absolute_error(y_test, y_pred)
    mse  = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2   = r2_score(y_test, y_pred)
    return {"Model": name, "MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

rows = []
rows.append(evaluate_best("SVR (best)",         best_svr, X_savings_test, y_savings_test))
rows.append(evaluate_best("Random Forest (best)", best_rf, X_savings_test, y_savings_test))

reg_results_df = pd.DataFrame(rows).sort_values("RMSE")
print("\n=== Savings → Regression (test set) ===")
display(reg_results_df)

# keep for later summary cell if needed
best_svr_savings     = best_svr
best_svr_savings_cv  = best_svr_score
best_rf_savings      = best_rf
best_rf_savings_cv   = best_rf_score


► Running CV for regression: SVR (Savings)


SVR (Savings) tuning:   0%|          | 0/36 [00:00<?, ?fits/s]

Best CV (neg RMSE): -2868.0748 | Params: {'linearsvr__epsilon': np.float64(0.3897685379286406), 'linearsvr__C': np.float64(36.06389385521764)}
Fit time: 0.06s

► Running CV for regression: RandomForest (Savings)


RandomForest (Savings) tuning:   0%|          | 0/180 [00:00<?, ?fits/s]

Best CV (neg RMSE): -666.4141 | Params: {'n_estimators': np.int64(110), 'min_samples_split': np.int64(2), 'min_samples_leaf': np.int64(1), 'max_features': 'sqrt', 'max_depth': np.int64(14), 'bootstrap': True}
Fit time: 8.81s

=== Savings → Regression (test set) ===


Unnamed: 0,Model,MAE,MSE,RMSE,R2
1,Random Forest (best),146.344882,293148.1,541.431512,0.995489
0,SVR (best),1043.81158,10040050.0,3168.60336,0.845506


In [7]:
# ================================================================
# 5B. Classification Model Tuning: Expense Category Prediction
#     Build and tune classifiers to predict spending category
# ================================================================

import time
import numpy as np
import pandas as pd
from copy import deepcopy
from tqdm.auto import tqdm

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)

print("\n" + "="*70)
print("CLASSIFICATION: Expense Category Prediction (Multi-class)")
print("="*70)

# ---------- Prepare classification dataset ----------
# Load original data if not already loaded
if "df" not in globals():
    DATA_PATH = "../data/processed/transactions_long.csv"
    df = pd.read_csv(DATA_PATH)

TARGET_CAT = "category"
ID_COLS = {"entity_id", "txn_id", "index", "id"}
drop_for_clf = [c for c in df.columns if c in ID_COLS or ("category_encoded" in c)]

# Extract features and target
X_category = df.drop(columns=drop_for_clf, errors="ignore").select_dtypes(include=[np.number]).copy()
y_category_raw = df[TARGET_CAT].copy()

# Encode category labels
le_cat = LabelEncoder()
y_category = le_cat.fit_transform(y_category_raw)

print(f"\nDataset for category classification:")
print(f"  Features (X): {X_category.shape}")
print(f"  Target classes: {len(le_cat.classes_)} → {list(le_cat.classes_)}")
print(f"  Class distribution:\n{pd.Series(y_category, index=y_category_raw.index).value_counts().sort_index()}\n")

# ---------- Train/test split with stratification ----------
from sklearn.model_selection import train_test_split

X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(
    X_category, y_category, test_size=0.2, random_state=SEED, stratify=y_category
)

print(f"Train shape: {X_cat_train.shape} | Test shape: {X_cat_test.shape}")

# ---------- Scale features for fair comparison ----------
scaler_cat = StandardScaler()
scaler_cat.fit(X_cat_train)
X_cat_train_scaled = scaler_cat.transform(X_cat_train)
X_cat_test_scaled = scaler_cat.transform(X_cat_test)

# ---------- Tuning configuration ----------
CV_FOLDS_CAT = 3          # 3-fold CV during tuning (fast)
N_ITER_CLF = 15           # parameter samples per classifier
N_JOBS = -1

rng_cat = np.random.RandomState(SEED)
cv_cat = StratifiedKFold(n_splits=CV_FOLDS_CAT, shuffle=True, random_state=SEED)
scoring_clf = 'f1_macro'  # macro F1 for imbalanced multi-class

# ---------- Estimators + parameter distributions ----------

# 1. Logistic Regression (multinomial)
log_clf = LogisticRegression(
    max_iter=1000, multi_class='multinomial', 
    random_state=SEED, n_jobs=N_JOBS
)
log_dist = {
    "C": np.exp(rng_cat.uniform(np.log(1e-2), np.log(1e2), size=N_ITER_CLF)),
    "solver": ["lbfgs"],
    "class_weight": [None, "balanced"],
}

# 2. Linear SVC (one-vs-rest)
svc_clf = LinearSVC(
    dual=True, max_iter=2000, tol=1e-3,
    random_state=SEED, class_weight='balanced'
)
svc_dist = {
    "C": np.exp(rng_cat.uniform(np.log(1e-2), np.log(1e2), size=N_ITER_CLF)),
}

# 3. Random Forest (multi-class by default)
rf_clf = RandomForestClassifier(
    random_state=SEED, n_jobs=N_JOBS
)
rf_dist = {
    "n_estimators": rng_cat.randint(50, 151, size=N_ITER_CLF),
    "max_depth": rng_cat.randint(6, 16, size=N_ITER_CLF),
    "min_samples_split": rng_cat.randint(2, 8, size=N_ITER_CLF),
    "min_samples_leaf": rng_cat.randint(1, 5, size=N_ITER_CLF),
    "max_features": ["sqrt"],
}

# ---------- Helper: Manual CV search with progress bar ----------
def param_sampler_helper(dist, n_iter, rng):
    """Convert distribution dict to ParameterSampler format."""
    from sklearn.model_selection import ParameterSampler
    space = {}
    for k, v in dist.items():
        if isinstance(v, (list, np.ndarray)):
            space[k] = v
        else:
            space[k] = [v]
    return ParameterSampler(space, n_iter=n_iter, random_state=rng)

def manual_cv_search_clf(label, base_est, dist, X, y, n_iter=N_ITER_CLF):
    """Manual CV search for classification with tqdm progress bar."""
    sampler = list(param_sampler_helper(dist, n_iter, rng_cat))
    pbar = tqdm(total=len(sampler)*CV_FOLDS_CAT, desc=f"{label} tuning", unit="fits")
    
    best_score = -np.inf
    best_params = None
    
    for params in sampler:
        est = clone(base_est).set_params(**params)
        try:
            # Use macro F1 for multi-class fairness
            scores = cross_val_score(est, X, y, scoring=scoring_clf, cv=cv_cat, n_jobs=N_JOBS)
            score = np.mean(scores)
        except Exception as e:
            score = -np.inf
            
        pbar.update(CV_FOLDS_CAT)
        
        if score > best_score:
            best_score = score
            best_params = deepcopy(params)
    
    pbar.close()
    
    # Fit best model on full training set
    best = clone(base_est).set_params(**best_params)
    t0 = time.perf_counter()
    best.fit(X, y)
    fit_time = time.perf_counter() - t0
    
    print(f"  Best CV ({scoring_clf}): {best_score:.4f}")
    print(f"  Best params: {best_params}")
    print(f"  Fit time: {fit_time:.2f}s")
    
    return best, best_score, best_params

# ---------- Run classification tuning ----------
print("\n► Tuning Logistic Regression (Category)...")
best_log_clf, best_log_score, best_log_params = manual_cv_search_clf(
    "LogReg (Category)", log_clf, log_dist, X_cat_train_scaled, y_cat_train, n_iter=N_ITER_CLF
)

print("\n► Tuning Linear SVC (Category)...")
best_svc_clf, best_svc_score, best_svc_params = manual_cv_search_clf(
    "LinearSVC (Category)", svc_clf, svc_dist, X_cat_train_scaled, y_cat_train, n_iter=N_ITER_CLF
)

print("\n► Tuning Random Forest (Category)...")
best_rf_clf, best_rf_score, best_rf_params = manual_cv_search_clf(
    "RandomForest (Category)", rf_clf, rf_dist, X_cat_train_scaled, y_cat_train, n_iter=N_ITER_CLF*2
)

# ---------- Evaluate on test set ----------
def evaluate_classifier(name, est, X_test, y_test, y_test_raw=None):
    """Evaluate and return metrics for a classification model."""
    y_pred = est.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    return {
        "Model": name,
        "Accuracy": accuracy,
        "Precision (macro)": precision_macro,
        "Recall (macro)": recall_macro,
        "F1 (macro)": f1_macro,
        "F1 (weighted)": f1_weighted,
    }, y_pred

print("\n" + "="*70)
print("CLASSIFICATION TEST SET RESULTS")
print("="*70)

clf_results = []
clf_predictions = {}

result_log, pred_log = evaluate_classifier(
    "Logistic Regression (best)", best_log_clf, X_cat_test_scaled, y_cat_test
)
clf_results.append(result_log)
clf_predictions["LogReg"] = pred_log

result_svc, pred_svc = evaluate_classifier(
    "Linear SVC (best)", best_svc_clf, X_cat_test_scaled, y_cat_test
)
clf_results.append(result_svc)
clf_predictions["LinearSVC"] = pred_svc

result_rf, pred_rf = evaluate_classifier(
    "Random Forest (best)", best_rf_clf, X_cat_test_scaled, y_cat_test
)
clf_results.append(result_rf)
clf_predictions["RandomForest"] = pred_rf

# Display results
clf_results_df = pd.DataFrame(clf_results).sort_values("F1 (macro)", ascending=False)
print("\nCategory Classification Results (Test Set):")
display(clf_results_df)

# Store best model for later use
best_clf_model = best_rf_clf  # Default to RF; can be changed based on preference
best_clf_name = "Random Forest"

# ---------- Show detailed report for best model ----------
print(f"\n{best_clf_name} - Detailed Classification Report:")
print(classification_report(y_cat_test, pred_rf, target_names=le_cat.classes_, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_cat_test, pred_rf)
cm_df = pd.DataFrame(cm, index=[f"True: {c}" for c in le_cat.classes_],
                      columns=[f"Pred: {c}" for c in le_cat.classes_])
print("\nConfusion Matrix:")
display(cm_df)

print("\n✅ Classification model tuning complete!")
print(f"   Best model saved: best_rf_clf (Random Forest)")
print(f"   Best CV score (macro F1): {best_rf_score:.4f}")



CLASSIFICATION: Expense Category Prediction (Multi-class)

Dataset for category classification:
  Features (X): (155939, 18)
  Target classes: 8 → ['Eating_Out', 'Education', 'Entertainment', 'Groceries', 'Healthcare', 'Miscellaneous', 'Transport', 'Utilities']
  Class distribution:
0    20000
1    15939
2    20000
3    20000
4    20000
5    20000
6    20000
7    20000
Name: count, dtype: int64

Train shape: (124751, 18) | Test shape: (31188, 18)

► Tuning Logistic Regression (Category)...


LogReg (Category) tuning:   0%|          | 0/45 [00:00<?, ?fits/s]

  Best CV (f1_macro): 0.4890
  Best params: {'solver': 'lbfgs', 'class_weight': None, 'C': np.float64(75.7947995334801)}
  Fit time: 21.65s

► Tuning Linear SVC (Category)...


LinearSVC (Category) tuning:   0%|          | 0/45 [00:00<?, ?fits/s]

  Best CV (f1_macro): 0.3991
  Best params: {'C': np.float64(13.826232179369875)}
  Fit time: 275.03s

► Tuning Random Forest (Category)...


RandomForest (Category) tuning:   0%|          | 0/90 [00:00<?, ?fits/s]

  Best CV (f1_macro): 0.4198
  Best params: {'n_estimators': np.int64(139), 'min_samples_split': np.int64(3), 'min_samples_leaf': np.int64(4), 'max_features': 'sqrt', 'max_depth': np.int64(12)}
  Fit time: 30.98s

CLASSIFICATION TEST SET RESULTS

Category Classification Results (Test Set):


Unnamed: 0,Model,Accuracy,Precision (macro),Recall (macro),F1 (macro),F1 (weighted)
0,Logistic Regression (best),0.510773,0.4923,0.511121,0.489787,0.489009
2,Random Forest (best),0.471239,0.410746,0.468697,0.415909,0.41541
1,Linear SVC (best),0.437444,0.42541,0.442923,0.394646,0.393984



Random Forest - Detailed Classification Report:
               precision    recall  f1-score   support

   Eating_Out       0.17      0.06      0.09      4000
    Education       0.53      0.37      0.44      3188
Entertainment       0.16      0.06      0.08      4000
    Groceries       0.91      0.99      0.95      4000
   Healthcare       0.34      0.69      0.46      4000
Miscellaneous       0.62      0.86      0.72      4000
    Transport       0.40      0.66      0.50      4000
    Utilities       0.15      0.06      0.09      4000

     accuracy                           0.47     31188
    macro avg       0.41      0.47      0.42     31188
 weighted avg       0.41      0.47      0.42     31188


Confusion Matrix:


Unnamed: 0,Pred: Eating_Out,Pred: Education,Pred: Entertainment,Pred: Groceries,Pred: Healthcare,Pred: Miscellaneous,Pred: Transport,Pred: Utilities
True: Eating_Out,244,0,463,0,2060,1027,39,167
True: Education,1,1183,2,333,50,0,1466,153
True: Entertainment,489,0,232,2,2117,1004,38,118
True: Groceries,0,18,0,3976,0,0,5,1
True: Healthcare,472,2,430,0,2746,38,60,252
True: Miscellaneous,237,0,317,0,7,3439,0,0
True: Transport,7,613,6,23,80,0,2638,633
True: Utilities,18,434,16,13,940,0,2340,239



✅ Classification model tuning complete!
   Best model saved: best_rf_clf (Random Forest)
   Best CV score (macro F1): 0.4198


In [8]:
# 6. Summary of tuned models

summary_rows = []

# Savings regression (from 5A)
summary_rows.append({
    "Task": "Savings (regression)",
    "Best model": "Random Forest",
    "Test MAE": reg_results_df.loc[reg_results_df["Model"]=="Random Forest (best)", "MAE"].item(),
    "Test RMSE": reg_results_df.loc[reg_results_df["Model"]=="Random Forest (best)", "RMSE"].item(),
    "Test R2": reg_results_df.loc[reg_results_df["Model"]=="Random Forest (best)", "R2"].item(),
})

# Category classification (from 5B)
summary_rows.append({
    "Task": "Category (multi-class)",
    "Best model": clf_results_df.iloc[0]["Model"],
    "Test Accuracy": clf_results_df.iloc[0]["Accuracy"],
    "Test F1 (macro)": clf_results_df.iloc[0]["F1 (macro)"],
    "Test F1 (weighted)": clf_results_df.iloc[0]["F1 (weighted)"],
})

summary_df = pd.DataFrame(summary_rows)
print("=== Tuned Model Summary ===")
display(summary_df)

=== Tuned Model Summary ===


Unnamed: 0,Task,Best model,Test MAE,Test RMSE,Test R2,Test Accuracy,Test F1 (macro),Test F1 (weighted)
0,Savings (regression),Random Forest,146.344882,541.431512,0.995489,,,
1,Category (multi-class),Logistic Regression (best),,,,0.510773,0.489787,0.489009


In [9]:
# 7. Save best models for reuse

import joblib
from pathlib import Path

models_dir = Path("../data/models")
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(best_rf_savings, models_dir / "best_savings_rf.pkl")
joblib.dump(best_rf_clf,    models_dir / "best_category_rf.pkl")

print("Saved:")
print(models_dir / "best_savings_rf.pkl")
print(models_dir / "best_category_rf.pkl")

Saved:
../data/models/best_savings_rf.pkl
../data/models/best_category_rf.pkl
