In [7]:
import pandas as pd
import numpy as np

# Load the preprocessed HELOC table (path is relative to the working directory).
df = pd.read_csv("heloc_preprocessed.csv")

# Readable labels that replace the CSV header positionally: the i-th label is
# assigned to the i-th column, and the last one names the target.
simple_feature_names = [
    "Overall Credit Risk Score",
    "Months Since First Credit Account",
    "Average Age of Credit Accounts",
    "Number of Well-Maintained Accounts",
    "Percentage of Accounts Never Late",
    "Months Since Last Missed Payment",
    "Percentage of Installment vs Revolving Loans",
    "Time Since Last Credit Application",
    "Credit Utilization Ratio",
    "Number of Active Credit Cards/Lines",
    "Loan Repaid",
]

# Relabel a copy so `df` keeps the header it was loaded with.
df_simple = df.copy()
df_simple.columns = simple_feature_names

# Separate the predictors from the "Loan Repaid" target column.
X = df_simple.drop(columns="Loan Repaid")
y = df_simple["Loan Repaid"]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [None]:
import itertools
import json
import os
import warnings
from datetime import datetime

from numpy.linalg import LinAlgError
from sklearn.compose import ColumnTransformer
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Run-level configuration for the nested cross-validation below.
n_folds = 5
random_state = 42
verbose = 2

# NOTE(review): the original cell referenced `directory`, `dataset_name` and
# several dataset/model lists without ever defining them. The values below are
# placeholders so the notebook survives Restart & Run All -- confirm the
# intended output folder and labels.
directory = "./results"
dataset_name = "heloc"
models_to_run = ["EBM"]

# One frame per outer fold holding the best hyperparameter string
# (rows = datasets, columns = models). A frame saved by a previous run is
# reloaded so existing entries are kept; otherwise an empty frame is created.
best_hpo_config_csvs = []
for i in range(1, n_folds + 1):
    fold_csv_path = f"{directory}/hpo_best_config_Fold_{i}.csv"
    if os.path.exists(fold_csv_path):
        best_hpo_config_csvs.append(
            pd.read_csv(fold_csv_path, index_col=0, header=0)
        )
    else:
        best_hpo_config_csvs.append(
            pd.DataFrame(index=[dataset_name], columns=models_to_run)
        )

# JSON file holding the hyperparameter search space, keyed by model name.
hyperparameter_config_file = "./default_hyperparams.json"

with open(hyperparameter_config_file, "r") as read_file:
    hpo_grid = json.load(read_file)

# TODO: Add XGB hyperparameters
# Split the EBM grid into parameter names and their candidate values, then
# build one config dict per element of the Cartesian product of the candidates.
keys, values = zip(*hpo_grid['EBM'].items())
permutations_dicts = []
for candidate_values in itertools.product(*values):
    permutations_dicts.append(dict(zip(keys, candidate_values)))

# Use regular KFold because dataset is fairly balanced (0 = 5459, 1 = 5000)
outer_cv = KFold(
    n_splits=n_folds, shuffle=True, random_state=random_state
)
model_name = "EBM"
# `task` was read below but never assigned in this notebook. The target is a
# 0/1 label (see the class counts above), so the classification branches apply.
task = "classification"

# NOTE(review): `Model`, `logger`, `dataset` and `OptimizationError` are used
# below but are not defined or imported anywhere in the visible notebook; they
# presumably come from the benchmark framework this cell was adapted from.
# They must be imported before this cell can run end to end.

# Every input column goes through the same standard-scaling step. The spec is
# shared by both stages; ColumnTransformer fits clones of it, so reuse is safe.
transformers = [
    ("num", Pipeline([("scaler", StandardScaler())]), X.columns.tolist())
]


def _fit_and_transform(fit_frame, apply_frame):
    """Fit a fresh ColumnTransformer on one frame and transform both frames.

    Parameters
    ----------
    fit_frame : pd.DataFrame
        Data the transformer is fitted on (and then transformed).
    apply_frame : pd.DataFrame
        Held-out data that is only transformed, never fitted on.

    Returns
    -------
    tuple
        ``(fitted_ct, fit_out, apply_out, numerical_names, categorical_names)``
        where the two frames carry the transformed feature names as columns and
        keep the row index of their inputs (so they stay aligned with ``y``),
        and the two lists hold the output names prefixed ``num__`` / ``cat__``.
    """
    fitted_ct = ColumnTransformer(transformers=transformers)
    fitted_ct.fit(fit_frame)
    feature_names = fitted_ct.get_feature_names_out()

    fit_out = pd.DataFrame(
        fitted_ct.transform(fit_frame),
        columns=feature_names,
        index=fit_frame.index,
    )
    apply_out = pd.DataFrame(
        fitted_ct.transform(apply_frame),
        columns=feature_names,
        index=apply_frame.index,
    )
    numerical_names = [
        name for name in feature_names if name.startswith("num__")
    ]
    categorical_names = [
        name for name in feature_names if name.startswith("cat__")
    ]
    return fitted_ct, fit_out, apply_out, numerical_names, categorical_names


for fold_i, (train_val_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    print(
        "\n",
        "-" * 5,
        "Model:",
        model_name,
        "-- Fold:",
        fold_i + 1,
        "/",
        n_folds,
        "-" * 5,
    )
    X_train_val, y_train_val = X.iloc[train_val_idx], y.iloc[train_val_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Inner hold-out split used only for hyperparameter selection.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val,
        y_train_val,
        test_size=0.25,
        stratify=y_train_val,
        random_state=1337,
    )

    # Scaling statistics come from the inner training part only, so the
    # validation rows do not leak into the transformer.
    (
        ct,
        X_train,
        X_val,
        transformed_numerical_names,
        transformed_categorical_names,
    ) = _fit_and_transform(X_train, X_val)

    # All columns of X are routed to the "num" transformer, so there are no
    # categorical inputs (the original read these from an undefined `dataset`).
    print("Dataset Categorical Columns:", [])
    print("Dataset Numerical Columns:", X.columns.tolist())

    print("Transformed Categorical Columns:", transformed_categorical_names)
    print("Transformed Numerical Columns:", transformed_numerical_names)

    if model_name == "MLP":
        X_train = X_train.values
        X_val = X_val.values

    if verbose == 1:
        print("")

    best_hp_config = None
    best_loss = np.inf
    training_time_of_best_model = np.inf
    timings_hpo = []
    # tuning hyperparameters in case of multiple hyperparameter candidates
    logger.set_current_dataset_model_dir(dataset_name, model_name)

    for config_idx, arg_dict in enumerate(permutations_dicts):
        # print the progress with replacing in line all the time
        if verbose == 1:
            print(
                "\r",
                "Progress: ",
                config_idx + 1,
                "/",
                len(permutations_dicts),
                end="",
            )
        elif verbose == 2:
            print("-" * 20)
            print(arg_dict)

        # define the model
        model = Model(
            model_name,
            task,
            arg_dict,
            num_cols=transformed_numerical_names,
            cat_cols=transformed_categorical_names,
        )

        start_training_time = datetime.now()

        try:
            # fit the model
            model.fit(X_train, y_train)
        except (LinAlgError, OptimizationError) as e:
            # A failing configuration is skipped rather than aborting the fold.
            print(e)
            warnings.warn(
                "Training with this hp combination, Error in Gaminet (Optimization Error, warm start) or Pygam (LinAlgError) possible"
            )
            continue

        training_time = (
            datetime.now() - start_training_time
        ).total_seconds()
        timings_hpo.append(training_time)

        if task == "classification":
            # log_loss expects probability estimates; scoring the hard labels
            # from predict() collapses the metric to a clipped error rate.
            y_val_proba = model.predict_proba(X_val)
            ce_loss = log_loss(y_val, y_val_proba)

            if ce_loss < best_loss:
                best_hp_config = arg_dict
                best_loss = ce_loss
                training_time_of_best_model = training_time

    if best_hp_config is None:
        # Every candidate failed to train (or none was scored); there is
        # nothing to refit, so skip the fold instead of passing None to Model.
        warnings.warn(
            f"No hyperparameter configuration could be evaluated in fold {fold_i + 1}; skipping this fold."
        )
        continue

    best_hpo_string = (
        str(best_hp_config)
        .replace("{", "")
        .replace("}", "")
        .replace(",", "\n")
    )
    best_hpo_config_csvs[fold_i].loc[
        dataset_name, model_name
    ] = best_hpo_string

    # now take the best hpo config and retrain on X_train_val and y_train_val;
    # the transformer is refitted on the full train+validation part.
    (
        ct_test,
        X_train_val,
        X_test,
        transformed_numerical_names,
        transformed_categorical_names,
    ) = _fit_and_transform(X_train_val, X_test)

    print("Transformed Categorical Columns:", transformed_categorical_names)
    print("Transformed Numerical Columns:", transformed_numerical_names)

    if model_name == "MLP":
        X_train_val = X_train_val.values
        X_test = X_test.values

    best_model = Model(
        model_name,
        task,
        best_hp_config,
        num_cols=transformed_numerical_names,
        cat_cols=transformed_categorical_names,
    )
    try:
        best_model.fit(X_train_val, y_train_val)
    except (OptimizationError, LinAlgError) as e:
        print(e)
        warnings.warn(
            "Training with this hp combination, Error in Gaminet (Optimization Error, warm start) or Pygam (LinAlgError) possible"
        )
        continue

    # evaluate the retrained best model on the hold out dataset
    y_pred = best_model.predict(X_test)

    if task == "classification":
        y_pred_proba = best_model.predict_proba(X_test)
        logger.log_classification_report(
            y_true=y_test, y_pred=y_pred, dataset=dataset, k_fold=fold_i
        )
        logger.log_roc_auc(
            y_true=y_test, y_pred_confidence=y_pred_proba, k_fold=fold_i
        )
    elif task == "regression":
        logger.log_regression_report(
            y_true=y_test, y_pred=y_pred, k_fold=fold_i
        )

    logger.log_timing(
        training_time_of_best_model, np.mean(timings_hpo), fold_i
    )

# Persist the per-fold best-configuration tables. The output folder is created
# first because to_csv does not create missing parent directories.
os.makedirs(directory, exist_ok=True)
for i in range(n_folds):
    best_hpo_config_csvs[i].to_csv(
        f"{directory}/hpo_best_config_Fold_{i + 1}.csv",
        index=True,
        header=True,
    )

