In [7]:
import pandas as pd
import numpy as np

# Load the preprocessed HELOC table from the working directory.
df = pd.read_csv("heloc_preprocessed.csv")

# Human-readable column names, applied positionally to the CSV's columns.
# The last entry ("Loan Repaid") is the prediction target.
simple_feature_names = [
    "Overall Credit Risk Score",
    "Months Since First Credit Account",
    "Average Age of Credit Accounts",
    "Number of Well-Maintained Accounts",
    "Percentage of Accounts Never Late",
    "Months Since Last Missed Payment",
    "Percentage of Installment vs Revolving Loans",
    "Time Since Last Credit Application",
    "Credit Utilization Ratio",
    "Number of Active Credit Cards/Lines",
    "Loan Repaid",
]

# Renamed copy; `df` keeps its original column labels.
# Raises ValueError if the CSV does not have exactly len(simple_feature_names) columns.
df_simple = df.set_axis(simple_feature_names, axis="columns")

# Feature matrix and target vector. No train/test split is done here:
# X and y are left whole.
X = df_simple.drop(columns=["Loan Repaid"])
y = df_simple["Loan Repaid"]


In [None]:
from interpret.glassbox import ExplainableBoostingClassifier

# Baseline EBM with default hyperparameters. The original call left
# `random_state=` without a value, which is a SyntaxError; a fixed seed is
# supplied so the model is reproducible.
ebm = ExplainableBoostingClassifier(random_state=42)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, classification_report
from interpret.glassbox import ExplainableBoostingClassifier
from itertools import product
import joblib

# ---------------------------------------------------------------------------
# EBM model selection and evaluation
#   1. Hold out 15% of the data as a final test set.
#   2. Run K-fold CV on the remainder. Inside each fold, a grid search picks
#      the hyperparameters with the lowest validation log-loss, and a model
#      refit on the fold's train+validation data is scored on the fold's
#      test split.
#   3. Refit with the configuration that reached the lowest validation loss
#      in any fold, evaluate on the hold-out set, and persist the model and
#      its preprocessor.
# ---------------------------------------------------------------------------
n_folds = 5
random_state = 42

model_name = "EBM"
ebm_hyperparameters = {
    "max_bins": [256, 512],
    "interactions": [0, 10, 20],
    "outer_bags": [8, 16],
    "inner_bags": [0, 4],
}

overall_best_hp_config = None
overall_best_loss = np.inf


def _transform_to_frame(transformer, features):
    """Apply a fitted transformer and wrap the output in a DataFrame.

    Columns are named from ``transformer.get_feature_names_out()``. The
    result is built from the transformed values only, so the row index of
    ``features`` is not carried over.
    """
    return pd.DataFrame(
        transformer.transform(features),
        columns=transformer.get_feature_names_out(),
    )


# Split off final test set before training
X_train_full, X_final_test, y_train_full, y_final_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=random_state
)

# Use KFold for cross-validation
outer_cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

for fold_i, (train_val_idx, test_idx) in enumerate(outer_cv.split(X_train_full, y_train_full)):
    print(f"\n----- Model: {model_name} -- Fold: {fold_i + 1}/{n_folds} -----")

    # Split train-validation-test
    X_train_val, y_train_val = X_train_full.iloc[train_val_idx], y_train_full.iloc[train_val_idx]
    X_test, y_test = X_train_full.iloc[test_idx], y_train_full.iloc[test_idx]

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=random_state
    )

    # Define preprocessing pipeline; the scaler is fitted on the inner
    # training split only, so validation/test rows do not influence it.
    num_pipe = Pipeline([("scaler", StandardScaler())])
    ct = ColumnTransformer([("num", num_pipe, X.columns.tolist())])
    ct.fit(X_train)

    # Transform data
    X_train_transformed = _transform_to_frame(ct, X_train)
    X_val_transformed = _transform_to_frame(ct, X_val)

    # Grid search over hyperparameters
    best_hp_config = None
    best_loss = np.inf

    for hp in product(*ebm_hyperparameters.values()):
        params = dict(zip(ebm_hyperparameters.keys(), hp))
        params["random_state"] = random_state  # Ensure consistency

        model = ExplainableBoostingClassifier(**params)
        model.fit(X_train_transformed, y_train)

        y_val_pred_proba = model.predict_proba(X_val_transformed)
        ce_loss = log_loss(y_val, y_val_pred_proba)

        if ce_loss < best_loss:
            best_loss = ce_loss
            best_hp_config = params
            best_model = model  # Store best model

    print(f"Best hyperparameters for fold {fold_i + 1}: {best_hp_config}")

    # Train final model on full train-val data with best params
    X_train_val_transformed = _transform_to_frame(ct, X_train_val)
    X_test_transformed = _transform_to_frame(ct, X_test)

    # best_hp_config already contains "random_state" (set in the grid loop).
    # Passing it again as an explicit keyword would raise
    # "TypeError: got multiple values for keyword argument 'random_state'".
    final_model = ExplainableBoostingClassifier(**best_hp_config)
    final_model.fit(X_train_val_transformed, y_train_val)

    # Evaluate on test set
    y_test_pred = final_model.predict(X_test_transformed)
    print(classification_report(y_test, y_test_pred))

    # Track best overall hyperparameters
    if best_loss < overall_best_loss:
        overall_best_loss = best_loss
        overall_best_hp_config = best_hp_config

# Train final model on full training data (excluding X_final_test).
# As above, the seed is already part of the stored configuration.
final_model = ExplainableBoostingClassifier(**overall_best_hp_config)

# Standardize using entire training data
ct_final = ColumnTransformer([("num", StandardScaler(), X_train_full.columns.tolist())])
ct_final.fit(X_train_full)

# Transform the training data
X_train_full_transformed = _transform_to_frame(ct_final, X_train_full)

# Train model
final_model.fit(X_train_full_transformed, y_train_full)

# Transform final test set
X_final_test_transformed = _transform_to_frame(ct_final, X_final_test)

# Predict on final test set
y_final_pred = final_model.predict(X_final_test_transformed)

# Compute final test metrics
print("\nFinal Model Evaluation on Hold-Out Test Set:")
print(classification_report(y_final_test, y_final_pred))

# Save final model and preprocessor
joblib.dump(final_model, "final_ebm_model.pkl")
joblib.dump(ct_final, "final_preprocessor.pkl")
