In [None]:
# Metric convention used throughout this notebook: lower is always better.
# Metrics where higher is better (e.g. AUC) are negated to follow the same rule.

import datetime

# --- where and how much to run ---
is_azure = False  # when True, run 'az login' first
n_replicates = 25  # 25 is nearly free: with 30 we already saturate 650 runners in 2 hours
n_instances = 650

# --- store / run settings ---
force_recreate = False
exist_ok = True
TIMEOUT_SEC = 180 * 24 * 60 * 60  # 180 days, in seconds
wheel_filepaths = [
    'interpret_core-0.6.8-py3-none-any.whl',
    'powerlift-0.1.12-py3-none-any.whl',
]

# --- experiment name ---
# By default the name is prefixed with the current local time. To use a fixed
# name instead, uncomment the second assignment and edit it.
experiment_name = datetime.datetime.now().strftime('%Y_%m_%d_%H%M__') + 'myexperiment'
# experiment_name = 'yyyy_mm_dd_hhmm__myexperiment'

print('Experiment name: ' + experiment_name)

In [None]:
# use exact versions for reproducibility of the RANK ordering
requirements = "numpy==1.26.4 pandas==2.2.2 scikit-learn==1.5.1 optuna==4.0.0 optuna-integration==4.0.0 xgboost==2.1.0 lightgbm==4.5.0 catboost==1.2.5 aplr==10.6.1"
!pip install -U --quiet {requirements}

In [None]:
# install interpret if not already installed
try:
    import interpret
except ModuleNotFoundError:
    !pip install -U --quiet interpret-core

In [None]:
# install powerlift if not already installed
try:
    import powerlift
except ModuleNotFoundError:
    !pip install -U --quiet powerlift[datasets,postgres]

In [None]:
# Build the powerlift Store and Benchmark for this experiment.
# With is_azure the Azure identifiers are looked up and the database URL is read
# from the environment; otherwise a SQLite file in the working directory is used.
import os
if is_azure:
    import requests
    import json
    import subprocess
    from azure.identity import AzureCliCredential

    # Token for Microsoft Graph obtained through the Azure CLI credential
    # (requires a prior 'az login').
    credential = AzureCliCredential()
    access_token = credential.get_token("https://graph.microsoft.com/.default").token
    headers = {'Authorization': f'Bearer {access_token}', 'Content-Type': 'application/json'}

    # Check the HTTP status before reading the payload: an error response carries
    # no 'id', which would otherwise silently leave azure_client_id as None.
    me_response = requests.get('https://graph.microsoft.com/v1.0/me', headers=headers)
    me_response.raise_for_status()
    azure_client_id = me_response.json().get('id')

    org_response = requests.get('https://graph.microsoft.com/v1.0/organization', headers=headers)
    org_response.raise_for_status()
    azure_tenant_id = org_response.json()['value'][0].get('id')

    # A failing CLI call leaves stdout empty; report its stderr rather than
    # letting json.loads fail on an empty string.
    az_account = subprocess.run("az account show", capture_output=True, text=True, shell=True)
    if az_account.returncode != 0:
        raise RuntimeError(f"'az account show' failed (have you run 'az login'?): {az_account.stderr.strip()}")
    subscription_id = json.loads(az_account.stdout).get("id")

    # Load variables from a .env file (if any), then read the connection
    # settings from the environment.
    from dotenv import load_dotenv
    load_dotenv()
    conn_str = os.getenv("DOCKER_DB_URL")
    resource_group = os.getenv("AZURE_RESOURCE_GROUP")
else:
    # Local run: SQLite database file in the current working directory.
    conn_str = f"sqlite:///{os.getcwd()}/powerlift.db"

from powerlift.bench import Store, Benchmark
store = Store(conn_str, force_recreate=force_recreate)
benchmark = Benchmark(store, name=experiment_name)

In [None]:
def trial_filter(task):
    """Choose which methods are benchmarked on ``task``.

    Returns a list whose entries are either a method name or a
    ``(method name, parameter dict)`` tuple. An empty list leaves the task
    out of the benchmark.

    Raises:
        Exception: if ``task.origin`` (or, for "pmlb" tasks, ``task.problem``)
            is not one of the values handled below.

    NOTE: datasets that were already accepted are remembered in the
    module-level set ``global_duplicates``. It is created on first use and is
    never cleared here, so it lives as long as the kernel does.
    """
    # Size limits (inclusive). The defaults only reject tasks reporting < 1.
    sample_bounds = (1, 1000000000000)
    feature_bounds = (1, 1000000000000)
    if task.n_samples < sample_bounds[0] or sample_bounds[1] < task.n_samples:
        return []
    if task.n_features < feature_bounds[0] or feature_bounds[1] < task.n_features:
        return []

    # Dataset collections: True = include in benchmark, False = skip.
    origin_included = {
        "openml_automl_regression": True,
        "openml_cc18": True,
        "openml_automl_classification": False,
    }
    # "pmlb" tasks are switched per problem type.
    pmlb_problem_included = {
        "binary": False,
        "multiclass": False,
        "regression": False,
    }
    if task.origin == "pmlb":
        if task.problem not in pmlb_problem_included:
            raise Exception(f"Unrecognized problem {task.problem}")
        if not pmlb_problem_included[task.problem]:
            return []
    elif task.origin in origin_included:
        if not origin_included[task.origin]:
            return []
    else:
        raise Exception(f"Unrecognized origin {task.origin}")

    # Datasets skipped by name.  TODO: reintroduce
    excluded_names = {'Devnagari-Script', 'mnist_784', 'isolet'}
    # Larger exclusion list kept for reference:
    #    'Fashion-MNIST', 'mfeat-pixel', 'Bioresponse',
    #    'mfeat-factors', 'isolet', 'cnae-9', "Internet-Advertisements",
    #    'har', 'Devnagari-Script', 'mnist_784', 'CIFAR_10',
    #    'Airlines_DepDelay_10M',
    if task.name in excluded_names:
        return []

    # Skip a dataset that shows up a second time with the same name and shape.
    seen = globals().setdefault("global_duplicates", set())
    key = (task.name, task.n_samples, task.n_features)
    if key in seen:
        print(f"Excluding duplicate: {key}")
        return []
    seen.add(key)

    # Methods to run; the commented "*_opt" entries are currently switched off.
    methods = [
        ("ebm", {}),
        ("ebm", {'interactions': 0}),
        # "ebm_opt",
        "xgb",
        # "xgb_opt",
        "lgbm",
        # "lgbm_opt",
        "catb",
        # "catb_opt",
        "rf_xgb",
        "rf_sk",
        "ert",
        "tree",
        "elastic",
        "sgd",
        "lm",
        "lsvm",
        "svm",
        "nn",
        "knn",
        "aplr",
    ]
    return methods

In [None]:
def trial_runner(trial):
    seed = 42
    seed += trial.replicate_num
    max_samples = None
    n_calibration_folds = 4  # 4 uses all cores on the containers

    from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
    from interpret.develop import set_option
    from interpret.utils._native import Native
    from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor
    from lightgbm import LGBMClassifier, LGBMRegressor
    from catboost import CatBoostClassifier, CatBoostRegressor
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor
    from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, SGDClassifier, SGDRegressor
    from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from aplr import APLRClassifier, APLRRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.calibration import CalibratedClassifierCV
    import optuna
    from optuna_integration.sklearn import OptunaSearchCV
    from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss, precision_score, recall_score, accuracy_score, balanced_accuracy_score
    from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error
    import numpy as np
    from time import time
    import warnings
    import gc
    import re
    import random

    # turn off AVX512F support since it adds variability in results given spotty architecture support
    set_option("acceleration", ~Native.AccelerationFlags_AVX512F)
    
    random.seed(seed)
    np.random.seed(seed)

    X, y = trial.task.data()

    if trial.task.problem == "regression":
        q75, q25 = np.percentile(y, [75, 25])
        interquartile_range = q75 - q25

    for col in X.columns:
        # catboost doesn't like missing categoricals, so make them a category
        col_data = X[col]
        if str(col_data.dtype) == "category" and col_data.isnull().any():
            X[col] = col_data.cat.add_categories('nan').fillna('nan')
    
    stratification = None
    if trial.task.problem in ["binary", "multiclass"]:
        # stratification = y
        pass  # Re-enable stratification if dataset fails from absent class in train/test sets (PMLB)

    # Airlines_DepDelay_10M crashes on 16GB machines using 20% test set when we require dense one hot encoded data (APLR).
    test_size = 0.2 if trial.task.name not in {"Airlines_DepDelay_10M"} else 0.1
    
    fit_params = {}
    fit_params["X"], X_test, fit_params["y"], y_test = train_test_split(X, y, test_size=test_size, stratify=stratification, random_state=seed)
    del y
    del X

    cat_bools = trial.task.meta["categorical_mask"]
    cat_cols = [i for i, val in enumerate(cat_bools) if val]
    num_cols = [i for i, val in enumerate(cat_bools) if not val]

    # Build optional preprocessor for use by methods below
    # missing categoricals already handled above by making new "nan" category
    cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True, dtype=np.int16)
    num_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])
    transformers = [("cat", cat_encoder, cat_cols), ("num", num_imputer, num_cols)]
    p = ColumnTransformer(transformers=transformers)  #, sparse_threshold=1.0)  # densify or sparsify


    ebm_params = trial.meta
    xgb_params = trial.meta
    lgbm_params = trial.meta
    catboost_params = trial.meta
    rf_xgb_params = trial.meta
    rf_sk_params = trial.meta
    ert_params = trial.meta
    tree_params = trial.meta
    elastic_params = trial.meta
    sgd_params = trial.meta
    lm_params = trial.meta
    lsvm_params = trial.meta
    svm_params = trial.meta
    nn_params = trial.meta
    knn_params = trial.meta
    aplr_params = trial.meta

    ebm_params["feature_types"] = ["nominal" if cat else "continuous" for cat in cat_bools]
    ebm_params["n_jobs"] = -1
    xgb_params["enable_categorical"] = True
    xgb_params["feature_types"] = ["c" if cat else "q" for cat in cat_bools]
    lgbm_params["verbosity"] = -1
    catboost_params["verbose"] = False
    rf_xgb_params["enable_categorical"] = True
    rf_xgb_params["feature_types"] = ["c" if cat else "q" for cat in cat_bools]
    rf_sk_params["random_state"] = seed
    rf_sk_params["n_jobs"] = -1
    ert_params["n_jobs"] = -1
    ert_params["random_state"] = seed
    tree_params["random_state"] = seed
    elastic_params["random_state"] = seed
    sgd_params["random_state"] = seed
    lm_params["n_jobs"] = -1
    lsvm_params["random_state"] = seed
    nn_params["random_state"] = seed
    knn_params["n_jobs"] = -1
    aplr_params["m"] = 3000

    if 1700 < trial.task.n_features:
        # TODO: EBMs can crash for now with too many interactions, so limit it until we have better fix
        # Bioresponse with 1776 features works most of the time, but occasionally fails.
        # Santander_transaction_value with 4991 features does not work.
        ebm_params["interactions"] = 0

    # DEBUG params to make the algorithms super fast
    #if 10000 < len(fit_params["y"]):
    #    debug_stratify = fit_params["y"] if trial.task.problem in ["binary", "multiclass"] else None
    #    _, fit_params["X"], _, fit_params["y"] = train_test_split(fit_params["X"], fit_params["y"], test_size=5000, stratify=debug_stratify, random_state=seed)
    #ebm_params["max_rounds"] = 1
    #ebm_params["interactions"] = 0
    #xgb_params["n_estimators"] = 1
    #lgbm_params["n_estimators"] = 1
    #catboost_params["n_estimators"] = 1
    #rf_xgb_params["n_estimators"] = 1
    #rf_sk_params["n_estimators"] = 1
    #ert_params["n_estimators"] = 1
    #tree_params["max_depth"] = 1
    #elastic_params["max_iter"] = 1
    #sgd_params["max_iter"] = 1
    #lsvm_params["max_iter"] = 1
    #nn_params["max_iter"] = 1
    #knn_params["n_neighbors"] = 1
    #knn_params["leaf_size"] = 1
    #aplr_params["m"] = 1

    # for these datasets, we have to subsample so much it is probably better to just use non-optimized
    ebm_classification_non_opt = {"Devnagari-Script", "CIFAR_10", "Fashion-MNIST", "mnist_784", "isolet", "MiceProtein", "cnae-9", "Bioresponse", "Internet-Advertisements", "madelon", "har", "texture"}
    xgb_classification_non_opt = {"Devnagari-Script", "CIFAR_10", "mnist_784", "Fashion-MNIST", "isolet"}
    lgbm_classification_non_opt = {"Devnagari-Script", "CIFAR_10", "mnist_784", "Fashion-MNIST", "isolet"}
    catb_classification_non_opt = {"Devnagari-Script", "CIFAR_10", "Fashion-MNIST", "mnist_784", "isolet", "MiceProtein", "madelon"}

    # Specify method
    if trial.task.problem in ["binary", "multiclass"]:
        if trial.method == "ebm" or trial.method == "ebm_opt" and trial.task.name in ebm_classification_non_opt:
            for param, val in ebm_params.copy().items():
                try:
                    set_option(param, val)
                    del ebm_params[param]
                except:
                    pass
            est = ExplainableBoostingClassifier(**ebm_params)
        elif trial.method == "ebm_opt":
            for param, val in ebm_params.copy().items():
                try:
                    set_option(param, val)
                    del ebm_params[param]
                except:
                    pass
            # TODO: change these optimization parameters
            param_grid = {
                'smoothing_rounds': optuna.distributions.IntDistribution(1, 4000, log=True),
                'interactions': optuna.distributions.FloatDistribution(0.0, 0.999),
                'inner_bags': optuna.distributions.IntDistribution(0, 0, step=50),  # would prefer 50
                'max_bins': optuna.distributions.IntDistribution(256, 65536, log=True),
                'max_interaction_bins': optuna.distributions.IntDistribution(8, 128, log=True),
                'greedy_ratio': optuna.distributions.FloatDistribution(0.0001, 4.0),
                'cyclic_progress': optuna.distributions.FloatDistribution(0.0, 1.0),
                'outer_bags': optuna.distributions.IntDistribution(14, 14),  # would prefer more
                'interaction_smoothing_rounds': optuna.distributions.IntDistribution(1, 500, log=True),
                'learning_rate': optuna.distributions.FloatDistribution(0.0025, 0.5, log=True),
                'max_leaves': optuna.distributions.IntDistribution(2, 5),
                'min_samples_leaf': optuna.distributions.IntDistribution(2, 100, log=True),
                'min_hessian': optuna.distributions.FloatDistribution(0.000001, 10.0, log=True),
                'max_rounds': optuna.distributions.IntDistribution(25000, 25000),
                'early_stopping_rounds': optuna.distributions.IntDistribution(50, 50),
                'early_stopping_tolerance': optuna.distributions.FloatDistribution(1e-10, 1e-5, log=True),
                'validation_size': optuna.distributions.FloatDistribution(0.1, 0.5),
            }
            est = OptunaSearchCV(
                estimator=ExplainableBoostingClassifier(**ebm_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_log_loss',
                verbose=0,
                random_state=seed,
                n_jobs=1  # EBM uses the cores efficiently
            )
        elif trial.method == "xgb" or trial.method == "xgb_opt" and trial.task.name in xgb_classification_non_opt:
            est = XGBClassifier(**xgb_params)
            fit_params["verbose"] = False
        elif trial.method == "xgb_opt":
            # from https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_cv.py
            # TODO: change and harmonize these optimization parameters
            param_grid = {
                'n_estimators': optuna.distributions.IntDistribution(50, 2000, log=True),
                'max_depth': optuna.distributions.IntDistribution(1, 9),
                'learning_rate': optuna.distributions.FloatDistribution(0.005, 0.5, log=True),
                'gamma': optuna.distributions.FloatDistribution(1e-8, 1.0, log=True),
                'min_child_weight': optuna.distributions.FloatDistribution(2, 10),
                'subsample': optuna.distributions.FloatDistribution(0.2, 1.0),
                'colsample_bytree': optuna.distributions.FloatDistribution(0.2, 1.0),
                'reg_alpha': optuna.distributions.FloatDistribution(1e-8, 1.0, log=True),
                'reg_lambda': optuna.distributions.FloatDistribution(1e-8, 1.0, log=True),
                'grow_policy': optuna.distributions.CategoricalDistribution(["depthwise", "lossguide"]),
            }
            est = OptunaSearchCV(
                estimator=XGBClassifier(**xgb_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_log_loss',
                verbose=0,
                random_state=seed,
                n_jobs=1  # catboost uses the cores efficiently
            )
            fit_params["verbose"] = False
        elif trial.method == "lgbm" or trial.method == "lgbm_opt" and trial.task.name in lgbm_classification_non_opt:
            est = LGBMClassifier(**lgbm_params)
            fit_params["categorical_feature"] = cat_cols
        elif trial.method == "lgbm_opt":
            # TODO: change and harmonize these optimization parameters
            param_grid = {
                'num_leaves': optuna.distributions.IntDistribution(2, 256, log=True),
                'max_depth': optuna.distributions.IntDistribution(-1, 30),
                'learning_rate': optuna.distributions.FloatDistribution(0.005, 0.5, log=True),
                'n_estimators': optuna.distributions.IntDistribution(50, 2000, log=True),
                'min_child_samples': optuna.distributions.IntDistribution(2, 100),
                'subsample_freq': optuna.distributions.IntDistribution(1, 1),
                'subsample': optuna.distributions.FloatDistribution(0.4, 1.0),
                'colsample_bytree': optuna.distributions.FloatDistribution(0.4, 1.0),
                'reg_alpha': optuna.distributions.FloatDistribution(1e-8, 10.0, log=True),
                'reg_lambda': optuna.distributions.FloatDistribution(1e-8, 10.0, log=True)
            }
            est = OptunaSearchCV(
                estimator=LGBMClassifier(**lgbm_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_log_loss',
                verbose=0,
                random_state=seed,
                n_jobs=1  # lGBM uses the cores efficiently
            )
            fit_params["categorical_feature"] = cat_cols
        elif trial.method == "catb" or trial.method == "catb_opt" and trial.task.name in catb_classification_non_opt:
            est = CatBoostClassifier(**catboost_params)
            fit_params["cat_features"] = cat_cols
        elif trial.method == "catb_opt":
            # from https://forecastegy.com/posts/catboost-hyperparameter-tuning-guide-with-optuna/
            # TODO: change and harmonize these optimization parameters
            param_grid = {
                'learning_rate': optuna.distributions.FloatDistribution(1e-3, 0.1, log=True),
                'depth': optuna.distributions.IntDistribution(1, 10),
                'colsample_bylevel': optuna.distributions.FloatDistribution(0.05, 1.0),
                'min_data_in_leaf': optuna.distributions.IntDistribution(1, 100),
            }
            est = OptunaSearchCV(
                estimator=CatBoostClassifier(**catboost_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_log_loss',
                verbose=0,
                random_state=seed,
                n_jobs=1  # catboost uses the cores efficiently
            )
            fit_params["cat_features"] = cat_cols
        elif trial.method == "rf_xgb":
            est = XGBRFClassifier(**rf_xgb_params)
            fit_params["verbose"] = False
        elif trial.method == "rf_sk":
            est = Pipeline([("p", p), ("est", RandomForestClassifier(**rf_sk_params))])
        elif trial.method == "ert":
            est = Pipeline([("p", p), ("est", ExtraTreesClassifier(**ert_params))])
        elif trial.method == "tree":
            est = Pipeline([("p", p), ("est", DecisionTreeClassifier(**tree_params))])
        elif trial.method == "elastic":
            elastic_params["n_jobs"] = -1
            est = Pipeline([("p", p), ("est", LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, **elastic_params))])
        elif trial.method == "sgd":
            est = Pipeline([("p", p), ("est", CalibratedClassifierCV(SGDClassifier(**sgd_params), n_jobs=-1, cv=n_calibration_folds))])
        elif trial.method == "lm":
            lm_params["random_state"] = seed
            est = Pipeline([("p", p), ("est", LogisticRegression(**lm_params))])
        elif trial.method == "lsvm":
            if trial.task.name in {"CIFAR_10"}:
                max_samples = 30000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Fashion-MNIST"}:
                max_samples = 20000  # OMM crashes without subsampling
            if trial.task.name in {"mnist_784"}:
                max_samples = 40000  # OMM crashes without subsampling
            est = Pipeline([("p", p), ("est", CalibratedClassifierCV(LinearSVC(**lsvm_params), n_jobs=-1, cv=n_calibration_folds))])
        elif trial.method == "svm":
            if trial.task.name in {"Fashion-MNIST"}:
                max_samples = 40000  # OMM crashes without subsampling
            if trial.task.name in {"CIFAR_10"}:
                max_samples = 50000  # crashes or fit time too long without subsampling
            svm_params["random_state"] = seed
            est = Pipeline([("p", p), ("est", CalibratedClassifierCV(SVC(**svm_params), n_jobs=-1, cv=n_calibration_folds))])
        elif trial.method == "nn":
            est = Pipeline([("p", p), ("est", MLPClassifier(**nn_params))])
        elif trial.method == "knn":
            est = Pipeline([("p", p), ("est", KNeighborsClassifier(**knn_params))])
        elif trial.method == "aplr":
            fit_params["y"] = fit_params["y"].astype(str)
            p.sparse_threshold = 0  # APLR only handles dense
            if trial.task.name in {"CIFAR_10"}:
                max_samples = 10000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Fashion-MNIST"}:
                max_samples = 20000  # crashes or fit time too long without subsampling
            if trial.task.name in {"mnist_784"}:
                max_samples = 15000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Devnagari-Script"}:
                # Devnagari-Script with 5000 samples takes 20,000 seconds
                max_samples = 5000  # crashes or fit time too long without subsampling
            est = Pipeline([("p", p), ("est", APLRClassifier(**aplr_params))])
        else:
            raise Exception(f"Unrecognized classification method name {trial.method}")
    elif trial.task.problem == "regression":
        if trial.method == "ebm":
            for param, val in ebm_params.copy().items():
                try:
                    set_option(param, val)
                    del ebm_params[param]
                except:
                    pass
            est = ExplainableBoostingRegressor(**ebm_params)
        elif trial.method == "ebm_opt":
            for param, val in ebm_params.copy().items():
                try:
                    set_option(param, val)
                    del ebm_params[param]
                except:
                    pass
            if trial.task.name in {"Allstate_Claims_Severity"}:
                # TODO: tweak
                max_samples = 5000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                # TODO: tweak
                max_samples = 50000  # crashes or fit time too long without subsampling
            if trial.task.name in {"nyc-taxi-green-dec-2016"}:
                # TODO: tweak
                max_samples = 20000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Buzzinsocialmedia_Twitter"}:
                max_samples = 2000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Yolanda"}:
                max_samples = 2000  # crashes or fit time too long without subsampling

            # TODO: these two new ones need to be ranged
            if trial.task.name in {"Santander_transaction_value"}:
                # TODO: determine
                max_samples = 1000  # crashes or fit time too long without subsampling
            if trial.task.name in {"pol"}:
                # TODO: determine
                max_samples = 1000  # crashes or fit time too long without subsampling
            
            # TODO: change these optimization parameters
            param_grid = {
                'smoothing_rounds': optuna.distributions.IntDistribution(1, 4000, log=True),
                'interactions': optuna.distributions.FloatDistribution(0.0, 0.999),
                'inner_bags': optuna.distributions.IntDistribution(0, 0, step=50),  # would prefer 50
                'max_bins': optuna.distributions.IntDistribution(256, 65536, log=True),
                'max_interaction_bins': optuna.distributions.IntDistribution(8, 128, log=True),
                'greedy_ratio': optuna.distributions.FloatDistribution(0.0001, 4.0),
                'cyclic_progress': optuna.distributions.FloatDistribution(0.0, 1.0),
                'outer_bags': optuna.distributions.IntDistribution(14, 14),  # would prefer more
                'interaction_smoothing_rounds': optuna.distributions.IntDistribution(1, 500, log=True),
                'learning_rate': optuna.distributions.FloatDistribution(0.0025, 0.5, log=True),
                'max_leaves': optuna.distributions.IntDistribution(2, 5),
                'min_samples_leaf': optuna.distributions.IntDistribution(2, 100, log=True),
                'min_hessian': optuna.distributions.FloatDistribution(0.000001, 10.0, log=True),
                'max_rounds': optuna.distributions.IntDistribution(25000, 25000),
                'early_stopping_rounds': optuna.distributions.IntDistribution(50, 50),
                'early_stopping_tolerance': optuna.distributions.FloatDistribution(1e-10, 1e-5, log=True),
                'validation_size': optuna.distributions.FloatDistribution(0.1, 0.5),
            }
            est = OptunaSearchCV(
                estimator=ExplainableBoostingRegressor(**ebm_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_mean_squared_error',
                verbose=0,
                random_state=seed,
                n_jobs=1  # EBM uses the cores efficiently
            )
        elif trial.method == "xgb":
            est = XGBRegressor(**xgb_params)
            fit_params["verbose"] = False
        elif trial.method == "xgb_opt":
            if trial.task.name in {"Allstate_Claims_Severity"}:
                # TODO: tweak
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                # TODO: tweak
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"nyc-taxi-green-dec-2016"}:
                # TODO: tweak
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Buzzinsocialmedia_Twitter"}:
                # TODO: tweak
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Yolanda"}:
                # TODO: tweak
                max_samples = 50000  # crashes or fit time too long without subsampling

            # from https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_cv.py
            # TODO: change and harmonize these optimization parameters
            param_grid = {
                'n_estimators': optuna.distributions.IntDistribution(50, 2000, log=True),
                'max_depth': optuna.distributions.IntDistribution(1, 9),
                'learning_rate': optuna.distributions.FloatDistribution(0.005, 0.5, log=True),
                'gamma': optuna.distributions.FloatDistribution(1e-8, 1.0, log=True),
                'min_child_weight': optuna.distributions.FloatDistribution(2, 10),
                'subsample': optuna.distributions.FloatDistribution(0.2, 1.0),
                'colsample_bytree': optuna.distributions.FloatDistribution(0.2, 1.0),
                'reg_alpha': optuna.distributions.FloatDistribution(1e-8, 1.0, log=True),
                'reg_lambda': optuna.distributions.FloatDistribution(1e-8, 1.0, log=True),
                'grow_policy': optuna.distributions.CategoricalDistribution(["depthwise", "lossguide"]),
            }
            est = OptunaSearchCV(
                estimator=XGBRegressor(**xgb_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_mean_squared_error',
                verbose=0,
                random_state=seed,
                n_jobs=1  # catboost uses the cores efficiently
            )
            fit_params["verbose"] = False
        elif trial.method == "lgbm":
            est = LGBMRegressor(**lgbm_params)
            fit_params["categorical_feature"] = cat_cols
        elif trial.method == "lgbm_opt":
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 2000000  # crashes or fit time too long without subsampling

            # TODO: change and harmonize these optimization parameters
            param_grid = {
                'num_leaves': optuna.distributions.IntDistribution(2, 256, log=True),
                'max_depth': optuna.distributions.IntDistribution(-1, 30),
                'learning_rate': optuna.distributions.FloatDistribution(0.005, 0.5, log=True),
                'n_estimators': optuna.distributions.IntDistribution(50, 2000, log=True),
                'min_child_samples': optuna.distributions.IntDistribution(2, 100),
                'subsample_freq': optuna.distributions.IntDistribution(1, 1),
                'subsample': optuna.distributions.FloatDistribution(0.4, 1.0),
                'colsample_bytree': optuna.distributions.FloatDistribution(0.4, 1.0),
                'reg_alpha': optuna.distributions.FloatDistribution(1e-8, 10.0, log=True),
                'reg_lambda': optuna.distributions.FloatDistribution(1e-8, 10.0, log=True)
            }
            est = OptunaSearchCV(
                estimator=LGBMRegressor(**lgbm_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_mean_squared_error',
                verbose=0,
                random_state=seed,
                n_jobs=1  # lGBM uses the cores efficiently
            )
            fit_params["categorical_feature"] = cat_cols
        elif trial.method == "catb":
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 5000000  # OOM crashes without subsampling
            est = CatBoostRegressor(**catboost_params)
            fit_params["cat_features"] = cat_cols
        elif trial.method == "catb_opt":
            if trial.task.name in {"Allstate_Claims_Severity"}:
                # TODO: tweak
                max_samples = 8000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                # TODO: tweak
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"nyc-taxi-green-dec-2016"}:
                # TODO: tweak
                max_samples = 50000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Buzzinsocialmedia_Twitter"}:
                # TODO: tweak
                max_samples = 5000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Yolanda"}:
                # TODO: tweak
                max_samples = 5000  # crashes or fit time too long without subsampling

            # from https://forecastegy.com/posts/catboost-hyperparameter-tuning-guide-with-optuna/
            # TODO: change and harmonize these optimization parameters
            param_grid = {
                'learning_rate': optuna.distributions.FloatDistribution(1e-3, 0.1, log=True),
                'depth': optuna.distributions.IntDistribution(1, 10),
                'colsample_bylevel': optuna.distributions.FloatDistribution(0.05, 1.0),
                'min_data_in_leaf': optuna.distributions.IntDistribution(1, 100),
            }
            est = OptunaSearchCV(
                estimator=CatBoostRegressor(**catboost_params),
                param_distributions=param_grid,
                cv=n_calibration_folds,
                n_trials=50,
                scoring='neg_mean_squared_error',
                verbose=0,
                random_state=seed,
                n_jobs=1  # catboost uses the cores efficiently
            )
            fit_params["cat_features"] = cat_cols
        elif trial.method == "rf_xgb":
            est = XGBRFRegressor(**rf_xgb_params)
            fit_params["verbose"] = False
        elif trial.method == "rf_sk":
            if trial.task.name in {"Allstate_Claims_Severity"}:
                max_samples = 200000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 500000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Buzzinsocialmedia_Twitter"}:
                max_samples = 200000  # OOM crashes without subsampling (583,250 samples originally)
            est = Pipeline([("p", p), ("est", RandomForestRegressor(**rf_sk_params))])
        elif trial.method == "ert":
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 300000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Allstate_Claims_Severity"}:
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Buzzinsocialmedia_Twitter"}:
                max_samples = 200000  # OOM crashes without subsampling (583,250 samples originally)
            if trial.task.name in {"Yolanda"}:
                max_samples = 200000  # OOM crashes without subsampling (400,000 samples originally)
            est = Pipeline([("p", p), ("est", ExtraTreesRegressor(**ert_params))])
        elif trial.method == "tree":
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 2000000  # fit time too long without subsampling
            est = Pipeline([("p", p), ("est", DecisionTreeRegressor(**tree_params))])
        elif trial.method == "elastic":
            est = Pipeline([("p", p), ("est", ElasticNet(**elastic_params))])
        elif trial.method == "sgd":
            est = Pipeline([("p", p), ("est", SGDRegressor(**sgd_params))])
        elif trial.method == "lm":
            est = Pipeline([("p", p), ("est", LinearRegression(**lm_params))])
        elif trial.method == "lsvm":
            est = Pipeline([("p", p), ("est", LinearSVR(**lsvm_params))])
        elif trial.method == "svm":
            if trial.task.name in {"Allstate_Claims_Severity"}:
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 100000  # crashes or fit time too long without subsampling
            if trial.task.name in {"nyc-taxi-green-dec-2016"}:
                max_samples = 150000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Buzzinsocialmedia_Twitter"}:
                max_samples = 300000  # crashes or fit time too long without subsampling
            if trial.task.name in {"Yolanda"}:
                # TODO: tweak. Some exit quicker, but others take LOOOONG.
                max_samples = 200000  # crashes or fit time too long without subsampling
            est = Pipeline([("p", p), ("est", SVR(**svm_params))])
        elif trial.method == "nn":
            est = Pipeline([("p", p), ("est", MLPRegressor(**nn_params))])
        elif trial.method == "knn":
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 100000  # crashes or fit time too long without subsampling
            est = Pipeline([("p", p), ("est", KNeighborsRegressor(**knn_params))])
        elif trial.method == "aplr":
            p.sparse_threshold = 0  # APLR only accepts dense data
            if trial.task.name in {"Airlines_DepDelay_10M"}:
                max_samples = 100000  # OOM crashes without subsampling
            if trial.task.name in {"nyc-taxi-green-dec-2016"}:
                max_samples = 300000  # OOM crashes without subsampling
            est = Pipeline([("p", p), ("est", APLRRegressor(**aplr_params))])
        else:
            raise Exception(f"Unrecognized regression method name {trial.method}")
    else:
        raise Exception(f"Unrecognized problem {trial.task.problem}")

    if max_samples is None:
        pass
    elif max_samples < len(fit_params["y"]):
        # subsample because the ML method crashes or takes too long (more than 15,000 seconds)
        _, fit_params["X"], _, fit_params["y"] = train_test_split(fit_params["X"], fit_params["y"], test_size=max_samples, random_state=seed)
        _ = None  # free the _ variable to make more room
    else:
        print(f"Ignoring max_sample of {max_samples} since there are {len(fit_params['y'])} training samples.")

    global global_counter
    try:
        global_counter += 1
    except NameError:
        global_counter = 0
    
    # Train
    print(f"FIT: {global_counter}, {trial.task.origin}, {trial.task.name}, {trial.method}, {trial.meta}, classes:{trial.task.n_classes}, features:{fit_params['X'].shape[1]}, train_samples:{fit_params['X'].shape[0]}, orig_samples:{trial.task.n_samples}")
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        gc.collect()  # clean out garbage to have as much memory available as possible
        start_time = time()
        est.fit(**fit_params)
        end_time = time()
    trial.log("fit_time", end_time - start_time)

    if isinstance(est, OptunaSearchCV):
        trial.log("opt", est.best_params_)

    if isinstance(est, (ExplainableBoostingClassifier, ExplainableBoostingRegressor)):
        trial.log("iterations", re.sub(r'\s+', ' ', np.array_str(est.best_iteration_)).replace('[ ', '[').replace('] [',']['))
    
    # clean out garbage to have as much memory available as possible
    del fit_params
    gc.collect()
    
    if trial.task.problem == "regression":
        start_time = time()
        predictions = est.predict(X_test)
        end_time = time()
        trial.log("pred_time", end_time - start_time)

        # Use NRMSE-IQR (normalized root mean square error by the interquartile range)
        # so that datasets with large predicted values do not dominate the benchmark
        # and the range is not sensitive to outliers. The rank is identical to RMSE.
        # https://en.wikipedia.org/wiki/Root_mean_square_deviation

        rmse = root_mean_squared_error(y_test, predictions)
        trial.log("rmse", rmse)
        trial.log("nrmse", rmse / interquartile_range)
        trial.log("r2", r2_score(y_test, predictions))
        trial.log("mae", mean_absolute_error(y_test, predictions))
        trial.log("mape", mean_absolute_percentage_error(y_test, predictions))
        trial.log("medae", median_absolute_error(y_test, predictions))
    else:
        start_time = time()
        predictions = est.predict_proba(X_test)
        end_time = time()
        trial.log("pred_time", end_time - start_time)

        if trial.task.problem == "binary":
            predictions = predictions[:,1]
            trial.log("logloss", log_loss(y_test, predictions))
            trial.log("auc", roc_auc_score(y_test, predictions))
            trial.log("brier", brier_score_loss(y_test, predictions))
            predictions = (0.5 < predictions).astype(np.int16)
            trial.log("precision", precision_score(y_test, predictions, zero_division=0.0))
            trial.log("recall", recall_score(y_test, predictions, zero_division=0.0))
            trial.log("accuracy", accuracy_score(y_test, predictions))
            trial.log("bal_acc", balanced_accuracy_score(y_test, predictions))
        else:
            trial.log("xent", log_loss(y_test, predictions))
            trial.log('ovo_auc', roc_auc_score(y_test, predictions, multi_class='ovo', average='weighted'))
            trial.log('ovr_auc', roc_auc_score(y_test, predictions, multi_class='ovr'))
            # TODO: add multiclass brier_score once scikit-learn supports it (open PR now)
            predictions = np.argmax(predictions, axis=1)
            trial.log("mprecision", precision_score(y_test, predictions, average='weighted', zero_division=0.0))
            trial.log("mrecall", recall_score(y_test, predictions, average='weighted', zero_division=0.0))
            trial.log("maccuracy", accuracy_score(y_test, predictions))
            trial.log("mbal_acc", balanced_accuracy_score(y_test, predictions))

In [None]:
from powerlift.bench import populate_with_datasets, retrieve_openml_cc18, retrieve_openml_automl_regression
from powerlift.bench import retrieve_openml_automl_classification, retrieve_catboost_50k, retrieve_pmlb
from powerlift.executors import LocalMachine, AzureContainerInstance
from itertools import chain

# Cache location handed to every dataset retriever below.
cache_dir = "~/.powerlift"

# Dataset suites that take part in this benchmark; uncomment entries to add more.
dataset_sources = [
    retrieve_openml_cc18,
    retrieve_openml_automl_regression,
    # retrieve_openml_automl_classification,
    # retrieve_catboost_50k,
    # retrieve_pmlb,
]
data_retrieval = chain(*[retrieve(cache_dir=cache_dir) for retrieve in dataset_sources])

# This downloads datasets once and feeds into the database.
populate_with_datasets(store, data_retrieval, exist_ok=exist_ok)

if not is_azure:
    # Local run: trials execute on this machine through the LocalMachine executor.
    benchmark.run(
        trial_runner,
        trial_filter,
        n_replicates=n_replicates,
        executor=LocalMachine(store, debug_mode=True),
    )
else:
    # Azure run: trials are dispatched to container instances that install the
    # pinned requirements plus the local wheels.
    executor = AzureContainerInstance(
        store, azure_tenant_id, subscription_id, azure_client_id, credential,
        resource_group=resource_group,
        pip_install=requirements + " interpret-core",
        wheel_filepaths=wheel_filepaths,
        n_instances=n_instances,
        image="mcr.microsoft.com/devcontainers/python:3.12",
    )
    benchmark.run(
        trial_runner,
        trial_filter,
        timeout=TIMEOUT_SEC,
        n_replicates=n_replicates,
        executor=executor,
    )

In [None]:
#benchmark.wait_until_complete()

In [None]:
# Import the module (not the class) so the name `datetime` keeps meaning the
# module that the first cell uses via `datetime.datetime.now()`; re-running
# that cell after this one would otherwise fail.
import datetime

# Snapshot every result logged so far so the analysis cells can reload it later.
results_df = benchmark.results()
results_df.to_csv(f"{experiment_name}.csv", index=False)

status_df = benchmark.status()

# Replace each start timestamp with the number of minutes elapsed since then.
# NOTE(review): assumes status_df["start_time"] holds naive UTC timestamps, as
# the previous datetime.utcnow() call implied - confirm against the store.
utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
status_df["start_time"] = (utc_now - status_df["start_time"]).dt.total_seconds() / 60.0
print(status_df['status'].value_counts().to_string(index=True, header=False))
print()

# Trials that reported an error. notna() treats both None and NaN as "no error",
# so the table and the per-message dump below always agree.
failed_df = status_df[status_df["errmsg"].notna()]
if not failed_df.empty:
    cols=["task", "method", "meta", "errmsg", "n_samples", "n_features", "n_classes", "total_categories"]
    print(failed_df.reindex(columns=cols).to_string(index=False))
    print()
    for errmsg in failed_df["errmsg"]:
        print("ERROR: " + str(errmsg))
    print()

# Trials with a non-negative runner_id, longest elapsed time first.
is_assigned = 0 <= status_df["runner_id"]
if is_assigned.any():
    cols=["runner_id", "task", "method", "meta", "replicate_num", "start_time", "n_samples", "n_features", "n_classes", "total_categories"]
    print(status_df[is_assigned].sort_values(by='start_time', ascending=False).reindex(columns=cols).to_string(index=False))

In [None]:
import pandas as pd
import os

# Show floats with six decimals in every printed table below.
pd.options.display.float_format = '{:.6f}'.format

# reload if analyzing later
results_df = pd.read_csv(experiment_name + '.csv')
print(f'Results (pre-filtered) count: {len(results_df)}')

# Optionally filter out results we want to replace
#results_df = results_df[results_df['method'] != 'ebm']
#results_df = results_df[(results_df['method'] != 'ebm') | (results_df['meta'] != '{}')]
#results_df = results_df[(results_df['method'] != 'ebm') | (results_df['meta'] != '{"interactions": 0}')]
print(f'Results (post-filtered) count: {len(results_df)}')

# Fill in results from previous runs if desired.
# Rows from base.csv are appended only when the current results do not already
# contain a row with the same key (task, method, meta, replicate, metric, seq).
basefile = 'base.csv'
if os.path.exists(basefile):
    filler_df = pd.read_csv(basefile)

    # Optionally filter out results from the filler
    filler_df = filler_df[filler_df['method'] != 'ebm']
    #filler_df = filler_df[(filler_df['method'] != 'ebm') | (filler_df['meta'] != '{}')]
    #filler_df = filler_df[(filler_df['method'] != 'ebm') | (filler_df['meta'] != '{"interactions": 0}')]

    key_columns = ['task', 'method', 'meta', 'replicate_num', 'name', 'seq_num']
    # Keep only the filler rows whose key is absent from the current results.
    already_present = filler_df.set_index(key_columns).index.isin(results_df.set_index(key_columns).index)
    filler_df = filler_df[~already_present]
    if 0 < filler_df.shape[0]:
        results_df = pd.concat([results_df, filler_df], ignore_index=True)
        results_df = results_df.sort_values(by=key_columns)
        results_df.to_csv("merged.csv", index=False)
    print(f'Filler count: {filler_df.shape[0]}')
    print(f'Results count: {results_df.shape[0]}')
    #print(filler_df.to_string())

# Infer each task's problem type from which headline metric was logged for it.
metric_to_type = {'auc': 'binary', 'ovo_auc': 'multiclass', 'nrmse': 'regression'}
types_df = results_df[results_df['name'].isin(['auc', 'ovo_auc', 'nrmse'])]
first_metric_per_task = types_df.groupby('task')['name'].first()
task_to_type = first_metric_per_task.map(metric_to_type)
results_df['type'] = results_df['task'].map(task_to_type).fillna('')

# Metrics where higher is better get their sign flipped so that lower is
# better for every metric in the tables below.
flip = [
    'r2', 'auc', 'precision', 'recall', 'accuracy', 'bal_acc',
    'ovo_auc', 'ovr_auc', 'mprecision', 'mrecall', 'maccuracy', 'mbal_acc',
]
condition = results_df['name'].isin(flip)
results_df.loc[condition, 'num_val'] *= -1

# Optionally filter out any incomplete datasets
incomplete_tasks = ['Devnagari-Script', 'mnist_784', 'isolet']
results_df = results_df[~results_df['task'].isin(incomplete_tasks)]
#results_df = results_df[results_df['task'] != 'CIFAR_10']
#results_df = results_df[results_df['task'] != 'Airlines_DepDelay_10M']
#results_df = results_df[results_df['task'] != 'Fashion-MNIST']
#results_df = results_df[results_df['task'] != 'har']
#results_df = results_df[results_df['task'] != 'cnae-9']
#results_df = results_df[results_df['task'] != 'MiceProtein']
#
#results_df = results_df[results_df['type'] != 'binary']
#results_df = results_df[results_df['type'] != 'multiclass']
#results_df = results_df[results_df['type'] != 'regression']
#
#results_df = results_df[(results_df['method'] != 'ebm') | (results_df['meta'] == '{"interactions": 0}') | (results_df['meta'] == '{}')]
#results_df = results_df[((results_df['method'] == 'ebm') & (results_df['meta'] == '{}')) | (results_df['method'] == 'xgb')]
#
#results_df = results_df[
#    (results_df['task'] == 'CIFAR_10') | 
#    (results_df['task'] == 'Fashion-MNIST') | 
#    (results_df['task'] == 'har') | 
#    (results_df['task'] == 'mnist_784') | 
#    (results_df['task'] == 'isolet') | 
#    (results_df['task'] == 'Allstate_Claims_Severity') | 
#    (results_df['task'] == 'Airlines_DepDelay_10M') | 
#    (results_df['task'] == 'Buzzinsocialmedia_Twitter') | 
#    (results_df['task'] == 'nyc-taxi-green-dec-2016') | 
#    (results_df['task'] == 'cnae-9') | 
#    (results_df['task'] == 'Santander_transaction_value') | 
#    (results_df['task'] == 'Yolanda') |
#    (results_df['task'] == 'Bioresponse')
#]
print(f'Final count: {len(results_df)}')

In [None]:
# Mean of every metric per (method, meta), one column per metric name.
averages = (
    results_df.groupby(['method', 'meta', 'name'])['num_val']
    .mean()
    .unstack()
    .reset_index()
)

# Per-metric ranks: rank the (method, meta, replicate) columns within each
# (task, metric) row, then average those ranks per metric name.
metric_ranks = (
    results_df.pivot_table(
        values='num_val',
        index=['task', 'name'],
        columns=['method', 'meta', 'replicate_num'],
    )
    .rank(axis=1, ascending=True, method='min')
    .stack(level='replicate_num', future_stack=True)
    .groupby('name')
    .mean()
    .transpose()
)
metric_ranks.columns = [f"{metric}_RANK" for metric in metric_ranks.columns]
metric_ranks = metric_ranks.reset_index()

# Overall rank: same ranking, restricted to the headline metrics and pivoted by task.
is_headline = results_df['name'].isin(['auc', 'ovo_auc', 'nrmse'])
overall_rank = (
    results_df[is_headline]
    .pivot_table(
        values='num_val',
        index='task',
        columns=['method', 'meta', 'replicate_num'],
    )
    .rank(axis=1, ascending=True, method='min')
    .stack(level='replicate_num', future_stack=True)
    .mean()
    .to_frame(name='RANK')
    .reset_index()
)

desired_columns = ['method', 'meta', 'RANK', 'auc', 'ovo_auc', 'nrmse', 'auc_RANK', 'ovo_auc_RANK', 'nrmse_RANK', 'fit_time', 'pred_time']
combined_df = (
    averages
    .merge(metric_ranks, on=['method', 'meta'])
    .merge(overall_rank, on=['method', 'meta'])
    .sort_values(by='RANK')
    .reindex(columns=desired_columns)
)

print("METHOD METRICS:\n")
print(combined_df.to_string(index=False))

In [None]:
desired_columns = ['method', 'meta', 'RANK', 'auc', 'ovo_auc', 'nrmse', 'fit_time', 'pred_time']
# Row order of the METHOD METRICS table; reused by the per-task tables below.
row_order = combined_df[['method', 'meta']]

# Number of logged values per (method, meta) and metric name.
counts = results_df.groupby(['method', 'meta', 'name']).size().unstack()
counts = counts.reindex(row_order, axis=0).reset_index()

# The RANK column counts the values that fed the overall rank, i.e. the sum of
# the headline-metric counts. A method that never logged one of these metrics
# has NaN there, so use a NaN-skipping row sum: adding the columns directly
# would turn the whole count into NaN (and then 0 after fillna).
rank_metrics = ['auc', 'ovo_auc', 'nrmse']
counts['RANK'] = counts.reindex(columns=rank_metrics).sum(axis=1)

counts = counts.reindex(columns=desired_columns)
counts = counts.fillna(0)
count_columns = counts.columns.drop(['method', 'meta'])
counts[count_columns] = counts[count_columns].astype(int)
print("METHOD COUNTS:\n")
print(counts.to_string(index=False))

In [None]:
# Per-task mean of the headline metric for every (method, meta) column.
is_task_metric = results_df['name'].isin(['auc', 'ovo_auc', 'rmse'])
filtered_df = results_df[is_task_metric]
grouped = (
    filtered_df.groupby(['task', 'method', 'meta', 'type'])['num_val']
    .agg(['mean'])
    .reset_index()
)
pivot_table = (
    grouped.pivot_table(index=['task', 'type'], columns=['method', 'meta'], values=['mean'])
    .droplevel(0, axis=1)  # drop the compound column term "mean"
    .reindex(row_order, axis=1)
)

ratio_column = ('ebm', '{}')
if ratio_column not in pivot_table.columns:
    # No default EBM column to compare against: order by type, then task.
    pivot_table = pivot_table.reset_index(-1).sort_values(by=['type', 'task'])
else:
    # Ratio of the default EBM mean to the smallest mean among non-'ebm' columns.
    non_ebm_columns = [col for col in pivot_table.columns if col[0] != ratio_column[0]]
    ebm_mean = pivot_table[ratio_column]
    best_other = pivot_table[non_ebm_columns].min(axis=1)
    ratio_col = (ebm_mean / best_other).values
    # Rows where the EBM mean is negative get the sign of their ratio flipped.
    ratio_col[ebm_mean < 0] *= -1
    pivot_table = pivot_table.reset_index(-1)
    pivot_table.insert(0, 'ratio', ratio_col)
    pivot_table = pivot_table.sort_values(by=['type', 'ratio'])

# Remember the row order so the following per-task tables line up with this one.
task_order = pivot_table.index
pivot_table = pivot_table.reset_index()
print("TASK MEAN (auc, ovo_auc, rmse):\n")
print(pivot_table.to_string())

In [None]:
# Per-task standard deviation of the headline metric, laid out in the same
# row and column order as the TASK MEAN table.
is_task_metric = results_df['name'].isin(['auc', 'ovo_auc', 'rmse'])
filtered_df = results_df[is_task_metric]
grouped = (
    filtered_df.groupby(['task', 'method', 'meta', 'type'])['num_val']
    .agg(['std'])
    .reset_index()
)
pivot_table = (
    grouped.pivot_table(index=['task', 'type'], columns=['method', 'meta'], values=['std'])
    .droplevel(0, axis=1)  # drop the compound column term "std"
    .reindex(row_order, axis=1)
    .reset_index(-1)
    .reindex(task_order, axis=0)
    .reset_index()
)
print("TASK STANDARD DEVIATION (auc, ovo_auc, rmse):\n")
print(pivot_table.to_string())

In [None]:
# Number of headline-metric values per task and (method, meta), laid out in
# the same row and column order as the TASK MEAN table.
is_task_metric = results_df['name'].isin(['auc', 'ovo_auc', 'rmse'])
filtered_df = results_df[is_task_metric]
grouped = (
    filtered_df.groupby(['task', 'method', 'meta', 'type'])['num_val']
    .agg(['count'])
    .reset_index()
)
pivot_table = (
    grouped.pivot_table(index=['task', 'type'], columns=['method', 'meta'], values=['count'])
    .fillna(0)
    .astype(int)
    .droplevel(0, axis=1)  # drop the compound column term "count"
    .reindex(row_order, axis=1)
    .reset_index(-1)
    .reindex(task_order, axis=0)
    .reset_index()
)
print("TASK COUNT:\n")
print(pivot_table.to_string())

In [None]:
# Mean fit and prediction time per (task, method, meta), one column per timing.
is_timing = results_df['name'].isin(['fit_time', 'pred_time'])
filtered_df = results_df[is_timing]
grouped = (
    filtered_df.groupby(['task', 'method', 'meta', 'type', 'name'])['num_val']
    .agg(['mean'])
    .reset_index()
)
pivot_table = (
    grouped.pivot_table(index=['task', 'method', 'meta', 'type'], columns=['name'], values=['mean'])
    .droplevel(0, axis=1)  # drop the compound column term "mean"
    .dropna(subset=['fit_time', 'pred_time'])  # keep rows that have both timings
)

# Total time is fit time plus prediction time.
times = pivot_table['fit_time'].add(pivot_table['pred_time']).to_frame(name='time')

# Lay the totals out in the same row and column order as the TASK MEAN table.
pivot_table = (
    times.pivot_table(index=['task', 'type'], columns=['method', 'meta'], values=['time'])
    .droplevel(0, axis=1)  # drop the compound column term "time"
    .reindex(row_order, axis=1)
    .reset_index(-1)
    .reindex(task_order, axis=0)
    .reset_index()
)
print("TASK TIME (fit_time + pred_time):\n")
print(pivot_table.to_string())

In [None]:
import numpy as np

# (method, meta) columns compared below: numerator first, denominator second.
ratio_columns = [('ebm', '{}'), ('xgb', '{}')]

# Mean fit time per task (rows) and (method, meta) (columns).
fit_times = results_df[results_df['name'] == 'fit_time']
fit_times = fit_times.pivot_table('num_val', 'task', ['method', 'meta'])

missing_columns = [col for col in ratio_columns if col not in fit_times.columns]
if not missing_columns:
    # Only the two compared columns must be present for a task. Dropping rows
    # on NaN in any column would discard every task for which some unrelated
    # method has no fit time.
    fit_times = fit_times.dropna(subset=ratio_columns)

print("FIT TIME RATIO DECILES:\n")
if missing_columns:
    print(f"skipped, no fit_time results for: {missing_columns}")
elif fit_times.empty:
    print("skipped, no task has fit times for both methods")
else:
    ratios = fit_times[ratio_columns[0]] / fit_times[ratio_columns[1]]
    fit_times["ratios"] = ratios
    fit_times_deciles = np.percentile(ratios, [90, 80, 70, 60, 50, 40, 30, 20, 10])
    fit_times_deciles = [f"{decile:.2f}  " for decile in fit_times_deciles]
    max_ratio = ratios.max()
    min_ratio = ratios.min()
    print(*fit_times_deciles)
    print(f"max: {max_ratio:.2f}")
    print(f"min: {min_ratio:.2f}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import ast

n_histogram_bins = 20
# Hyperparameters whose histogram is drawn on a logarithmic x axis.
log_scale_params = [
    'smoothing_rounds', 'max_bins', 'max_interaction_bins',
    'interaction_smoothing_rounds', 'learning_rate',
    'min_samples_leaf', 'min_hessian', 'early_stopping_tolerance'
]

# One dict of selected hyperparameters per 'ebm_opt' trial, parsed from the
# 'opt' entries logged in the json_val column.
ebm_opt_df = results_df[(results_df['method'] == 'ebm_opt') & (results_df['name'] == 'opt')]['json_val']
json_dicts = ebm_opt_df.apply(ast.literal_eval)

# Average of every hyperparameter across all parsed trials.
avg_dict = {}
for d in json_dicts:
    for key, value in d.items():
        avg_dict[key] = avg_dict.get(key, 0) + value / len(json_dicts)

print(avg_dict)

for key in avg_dict:
    values = [d[key] for d in json_dicts]
    low = min(values)
    high = max(values)

    plt.figure(figsize=(10, 5))

    # Log bins need a strictly positive range; otherwise log10 yields invalid
    # edges, so such parameters fall back to linear bins.
    if key in log_scale_params and 0 < low:
        # n bins need n + 1 edges, which keeps the bin count equal to the linear case.
        bins = np.logspace(np.log10(low), np.log10(high), n_histogram_bins + 1)
        plt.hist(values, bins=bins, alpha=0.75)
        plt.xscale('log')
    else:
        plt.hist(values, bins=n_histogram_bins, alpha=0.75)

    plt.title(f'Histogram of {key}')
    plt.xlabel(key)
    plt.ylabel('Frequency')
    plt.grid(True)

    # Mark the smallest and largest selected value.
    plt.axvline(x=low, color='r', linestyle='--', label=f'Low: {low}')
    plt.axvline(x=high, color='g', linestyle='--', label=f'High: {high}')
    plt.legend()

    plt.show()