In [8]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, average_precision_score,
    precision_score, recall_score, f1_score, log_loss
)
from sklearn.compose import ColumnTransformer
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from itertools import product
from sklearn.ensemble import RandomForestClassifier
import json
import joblib

# Registry of estimator classes, keyed by the short model names that the
# cross-validation cell looks up via `models[model_name]`.
models = dict(
    LR=LogisticRegression,
    RF=RandomForestClassifier,
    XGB=XGBClassifier,
    EBM=ExplainableBoostingClassifier,
)

# Hyperparameter grid loaded from disk. The cross-validation cell iterates it
# as {model key -> {hyperparameter name -> iterable of candidate values}}.
with open("hpo_grid.json", "r") as grid_file:
    hpo_grid = json.loads(grid_file.read())

# Number of outer CV folds and the seed shared by all splitters.
n_folds, random_state = 5, 42

# Load the dataset and separate the target column from the features.
df = pd.read_csv("heloc-preprocessed-simple.csv")
y = df.loc[:, "Loan Repaid"]
X = df.drop(columns=["Loan Repaid"])

In [2]:
# Nested evaluation:
#   outer loop  -> K-fold split into train+val / test (generalisation estimate)
#   inner split -> train / val hold-out used for hyperparameter selection
# For every (model, fold) the configuration with the lowest validation log-loss
# is kept and scored on train, val and test. All scores are written to
# "evaluation_results.csv"; the best EBM across folds is pickled to "best-ebm.pkl".

# Use KFold (not stratified) since dataset is fairly balanced (48:52)
outer_cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

# One dict per (model, fold, dataset split); converted to a DataFrame at the end.
results = []

# Iterate over each model
for model_name, hyperparams in hpo_grid.items():
    print(f"\n===== Evaluating Model: {model_name} =====")

    if model_name == "EBM":
        # Track the EBM (plus its scaler and raw test fold) with the best
        # validation ROC-AUC across the outer folds.
        best_ebm = None
        best_ebm_roc_auc = float('-inf')
        best_ebm_ct = None
        best_ebm_test_X = None
        best_ebm_test_y = None

    # Run 5-fold cross-validation
    for fold_i, (train_val_idx, test_idx) in enumerate(outer_cv.split(X, y)):
        print(f"\n----- Model: {model_name} -- Fold: {fold_i + 1}/{n_folds} -----")

        # Split train-val and test set for fold
        X_train_val, y_train_val = X.iloc[train_val_idx], y.iloc[train_val_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # Split train-val into train and val for hpo.
        # 25% of the train-val part is held out for validation.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=random_state
        )

        # Fit the scaler ONLY on the current train split, then reuse it for
        # val and test so no statistics leak from held-out data.
        scaler = ColumnTransformer([("num", StandardScaler(), X.columns)])
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)

        # Track best HPs per fold
        best_fold_hp_config = None
        best_fold_loss = np.inf
        best_fold_model = None

        # Grid search over hyperparameters (Cartesian product of all value lists)
        for hp in product(*hyperparams.values()):
            params = dict(zip(hyperparams.keys(), hp))

            # Initialize and train model
            model = models[model_name](**params)
            model.fit(X_train_scaled, y_train)

            # Every registered model exposes predict_proba, so the validation
            # cross-entropy is always computable. The previous fallback
            # predicted on the *unscaled* X_val and produced a NaN loss that
            # could never be selected, so it has been removed.
            y_val_pred_proba = model.predict_proba(X_val_scaled)
            ce_loss = log_loss(y_val, y_val_pred_proba)

            if ce_loss < best_fold_loss:
                best_fold_loss = ce_loss
                best_fold_hp_config = params
                best_fold_model = model  # Store the best model found

        # A NaN loss never compares as smaller, so fail loudly instead of
        # crashing later on `None.predict(...)`.
        if best_fold_model is None:
            raise RuntimeError(
                f"No valid hyperparameter configuration found for {model_name} in fold {fold_i + 1}"
            )

        # After the grid search, select the best model and print hyperparameters
        print(f"Best hyperparameters for {model_name} in Fold {fold_i + 1}: {best_fold_hp_config}")

        # NOTE: the loop variables must NOT be called X / y. Rebinding those
        # names would replace the full dataset with a scaled NumPy array, so
        # the next fold's `X.iloc[...]` / `X.columns` would fail.
        for dataset_name, X_split, y_split in [
            ("train", X_train_scaled, y_train),
            ("val", X_val_scaled, y_val),
            ("test", X_test_scaled, y_test),
        ]:
            y_pred = best_fold_model.predict(X_split)
            # predict_proba returns one column per class; column 1 is the
            # probability of the positive class, which the ranking metrics
            # below expect as the score (assumes a binary 0/1 target).
            y_pred_proba = best_fold_model.predict_proba(X_split)[:, 1]

            scores = {
                "model": model_name,
                "fold": fold_i + 1,
                "dataset": dataset_name,
                "accuracy": accuracy_score(y_split, y_pred),
                "roc_auc": roc_auc_score(y_split, y_pred_proba, average="macro"),
                "pr_auc": average_precision_score(y_split, y_pred_proba, average="macro"),
                "precision": precision_score(y_split, y_pred, average="macro", zero_division=0),
                "recall": recall_score(y_split, y_pred, average="macro", zero_division=0),
                "f1_score": f1_score(y_split, y_pred, average="macro", zero_division=0),
            }

            # Choose the EBM to persist by its *validation* ROC-AUC only.
            # Comparing across all three splits would in practice pick the fold
            # with the highest training score, and using the test split would
            # leak the outer evaluation data into model selection.
            if (
                model_name == "EBM"
                and dataset_name == "val"
                and scores["roc_auc"] > best_ebm_roc_auc
            ):
                best_ebm = best_fold_model
                best_ebm_roc_auc = scores["roc_auc"]
                best_ebm_ct = scaler
                best_ebm_test_X = X_test
                best_ebm_test_y = y_test

            results.append(scores)

    # TODO: best_ebm is trained on standardized features, so its shape functions
    # are not directly interpretable in the original feature units.
    if model_name == "EBM" and best_ebm is not None:
        joblib.dump(best_ebm, "best-ebm.pkl")
        # TODO: Save scaler, X_test and y_test of the best_ebm fold
        # (kept in best_ebm_ct, best_ebm_test_X and best_ebm_test_y).

df_results = pd.DataFrame(results)
df_results.to_csv("evaluation_results.csv", index=False)
print("Results saved to evaluation_results.csv")

# Save model
#joblib.dump(final_model, "final_ebm_standardized_model.pkl")
#joblib.dump(scaler, "scaler.pkl")



----- Model: EBM -- Fold: 1/5 -----
Best hyperparameters: {'max_bins': 256, 'interactions': 10, 'outer_bags': 8, 'inner_bags': 4, 'random_state': 42}

----- Model: EBM -- Fold: 2/5 -----
Best hyperparameters: {'max_bins': 512, 'interactions': 20, 'outer_bags': 16, 'inner_bags': 4, 'random_state': 42}

----- Model: EBM -- Fold: 3/5 -----
Best hyperparameters: {'max_bins': 512, 'interactions': 20, 'outer_bags': 16, 'inner_bags': 4, 'random_state': 42}

----- Model: EBM -- Fold: 4/5 -----
Best hyperparameters: {'max_bins': 512, 'interactions': 20, 'outer_bags': 16, 'inner_bags': 4, 'random_state': 42}

----- Model: EBM -- Fold: 5/5 -----
Best hyperparameters: {'max_bins': 256, 'interactions': 20, 'outer_bags': 16, 'inner_bags': 0, 'random_state': 42}
              precision    recall  f1-score   support

           0       0.74      0.71      0.73       819
           1       0.70      0.73      0.71       750

    accuracy                           0.72      1569
   macro avg       0.72

['scalers.pkl']