In [None]:
# use exact versions of these in order to preserve RANK ordering better
!pip install -U --quiet numpy==1.26.4 pandas==2.2.2 scikit-learn==1.5.1 xgboost==2.1.0 lightgbm==4.5.0 catboost==1.2.5 aplr==10.6.1

In [None]:
# install interpret if not already installed
try:
    import interpret
except ModuleNotFoundError:
    !pip install -U --quiet interpret-core

In [None]:
# install powerlift if not already installed
try:
    import powerlift
except ModuleNotFoundError:
    !pip install -U --quiet powerlift[datasets,postgres]

In [None]:
def trial_filter(task):
    """Return the names of the methods that should be run on ``task``.

    An empty list means the task is skipped. A task is skipped when its
    row/column counts fall outside the configured bounds, when its origin
    (or, for "pmlb", its problem type) is switched off below, when its name
    is in the exclusion set, or when a task with the same
    (name, n_rows, n_cols) key has already been accepted.

    The keys seen so far live in the module-level ``global_duplicates`` set,
    which is created on first use and is never cleared by this function.

    Raises:
        Exception: for an origin, or a "pmlb" problem type, that is not
            listed below.
    """
    lowest_rows, highest_rows = 1, 1000000000000
    lowest_cols, highest_cols = 1, 1000000000000

    # Size limits: rows are checked before columns.
    n_rows = task.scalar_measure("n_rows")
    if n_rows < lowest_rows or highest_rows < n_rows:
        return []
    n_cols = task.scalar_measure("n_cols")
    if n_cols < lowest_cols or highest_cols < n_cols:
        return []

    # Origin switchboard: falling through this section means "include".
    origin = task.origin
    if origin == "pmlb":
        # Every known pmlb problem type is currently switched off.
        if task.problem not in ("binary", "multiclass", "regression"):
            raise Exception(f"Unrecognized problem {task.problem}")
        return []
    if origin == "openml_automl_classification":
        return []
    if origin not in ("openml_automl_regression", "openml_cc18"):
        raise Exception(f"Unrecognized origin {task.origin}")

    # Datasets skipped by name. Candidate lists that have been used here:
    #   ['isolet', 'Devnagari-Script', 'CIFAR_10', 'Airlines_DepDelay_10M']
    #   ['Fashion-MNIST', 'mfeat-pixel', 'Bioresponse', 'mfeat-factors',
    #    'isolet', 'cnae-9', "Internet-Advertisements", 'har',
    #    'Devnagari-Script', 'mnist_784', 'CIFAR_10', 'Airlines_DepDelay_10M']
    skipped_names = set()
    if task.name in skipped_names:
        return []

    # Accept only the first task for each (name, n_rows, n_cols) key.
    global global_duplicates
    try:
        seen_keys = global_duplicates
    except NameError:
        seen_keys = global_duplicates = set()
    key = (task.name, n_rows, n_cols)
    if key in seen_keys:
        print(f"Excluding duplicate: {key}")
        return []
    seen_keys.add(key)

    enabled_methods = [
        "ebm-base",
        "xgboost-base",
        "aplr-base",
        # "lightgbm-base",
        # "catboost-base",
    ]
    return enabled_methods

In [None]:
def trial_runner(trial):
    """Fit one method on one task, then log timings and evaluation metrics.

    The function reads ``trial.task`` (``problem``, ``origin``, ``name``,
    ``scalar_measure`` and ``data``) and ``trial.method.name``, and reports
    every measurement through ``trial.log(name, value)``.

    Steps:
      1. Load ``X``, ``y`` and ``meta`` from the task and turn missing values
         in categorical columns into an explicit 'NaN' category.
      2. Split 70/30 into train/test with ``random_state=seed`` and no
         stratification.
      3. Build the estimator selected by ``trial.method.name`` for the task's
         problem type.
      4. Time ``fit`` (logged as "fit_time") and prediction on the test split
         (logged as "predict_time").
      5. Log metrics by problem type:
           binary     -> "auc", "log_loss"
           multiclass -> "multi_auc" (weighted, one-vs-one), "cross_entropy"
           regression -> "nrmse" (RMSE divided by the IQR of the full ``y``)

    Progress is printed to stdout together with the module-level
    ``global_counter``.

    Raises:
        Exception: if the method name or the problem type is not recognized.
    """
    # Only used as random_state for the train/test split; it is not passed to the estimators.
    seed=42
    max_interaction_features=1000
    ebm_base_params = {}
    xgb_base_params = {}
    lightgbm_base_params = {}
    catboost_base_params = {}
    # Reduced settings that were used for quick smoke tests:
    # ebm_base_params = {"max_rounds":2, "interactions":0}
    # xgb_base_params = {"n_estimators":1}
    # lightgbm_base_params = {"n_estimators":1}
    # catboost_base_params = {"n_estimators":1}

    if max_interaction_features < trial.task.scalar_measure("n_cols"):
        # TODO: EBMs can crash for now with too many interactions, so limit it until we have better fix
        ebm_base_params["interactions"] = 0

    # All imports are local to the function.
    # NOTE(review): presumably so the runner is self-contained when an executor
    # runs it outside this notebook's namespace — confirm against powerlift.
    from xgboost import XGBClassifier, XGBRegressor
    from lightgbm import LGBMClassifier, LGBMRegressor
    from catboost import CatBoostClassifier, CatBoostRegressor
    from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
    from aplr import APLRClassifier, APLRRegressor
    from sklearn.metrics import roc_auc_score, root_mean_squared_error, log_loss
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    import numpy as np
    from time import time
    import warnings

    X, y, meta = trial.task.data(["X", "y", "meta"])

    for col in X.columns:
        # catboost doesn't like missing categoricals, so make them a category
        col_data = X[col]
        if str(col_data.dtype) == "category" and col_data.isnull().any():
            X[col] = col_data.cat.add_categories('NaN').fillna('NaN')
    
    # One truthy/falsy flag per feature.
    # NOTE(review): assumes the mask is aligned with the column order of X — confirm.
    categoricals = meta["categorical_mask"]
    categorical_ints = [i for i, val in enumerate(categoricals) if val]
    
    # XGB and EBM already handle this via CategoricalDtype but make it clear
    xgb_feature_types = ["c" if cat else "q" for cat in categoricals]
    ebm_feature_types = ["nominal" if cat else "continuous" for cat in categoricals]

    stratification = None
    if trial.task.problem in ["binary", "multiclass"]:
        # stratification = y
        pass  # Re-enable stratification if dataset fails from absent class in train/test sets (PMLB)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=stratification, random_state=seed)

    # Build preprocessor
    # It is referenced only by the "aplr-base" pipelines below: categorical
    # columns are one-hot encoded, the remaining columns pass through unchanged,
    # and SimpleImputer then fills missing values with the most frequent value
    # and appends missing-indicator columns.
    is_cat = meta["categorical_mask"]
    cat_cols = [idx for idx in range(X.shape[1]) if is_cat[idx]]
    num_cols = [idx for idx in range(X.shape[1]) if not is_cat[idx]]
    cat_ohe_step = ("ohe", OneHotEncoder(sparse_output=True, handle_unknown="ignore"))
    cat_pipe = Pipeline([cat_ohe_step])
    num_pipe = Pipeline([("identity", FunctionTransformer())])
    transformers = [("cat", cat_pipe, cat_cols), ("num", num_pipe, num_cols)]
    ct = Pipeline(
        [
            ("ct", ColumnTransformer(transformers=transformers, sparse_threshold=0)),
            (
                "missing",
                SimpleImputer(add_indicator=True, strategy="most_frequent"),
            ),
        ]
    )

    # Specify method
    # Each branch sets `est` (the unfitted estimator) and `fit_params` (the
    # keyword arguments later passed to est.fit).
    if trial.task.problem in ["binary", "multiclass"]:
        if trial.method.name == "ebm-base":
            est = ExplainableBoostingClassifier(feature_types=ebm_feature_types, **ebm_base_params)
            fit_params = {"X":X_train, "y":y_train}
        elif trial.method.name == "xgboost-base":
            est = XGBClassifier(enable_categorical=True, feature_types=xgb_feature_types, **xgb_base_params)
            fit_params = {"X":X_train, "y":y_train, "verbose": False}
        elif trial.method.name == "aplr-base":
            est = Pipeline(
                [
                    ("ct", ct),
                    (
                        "est",
                        APLRClassifier(m=3000),
                    ),
                ]
            )
            # Both label vectors are converted to string arrays, so the metrics
            # below compare string labels for this method.
            # NOTE(review): presumably APLRClassifier requires string labels — confirm.
            y_train = y_train.astype(str).to_numpy()
            y_test = y_test.astype(str).to_numpy()
            fit_params = {"X":X_train, "y":y_train}
        elif trial.method.name == "lightgbm-base":
            est = LGBMClassifier(verbosity=-1, **lightgbm_base_params)
            fit_params = {"X":X_train, "y":y_train, "categorical_feature": categorical_ints}
        elif trial.method.name == "catboost-base":
            est = CatBoostClassifier(verbose=False, **catboost_base_params)
            fit_params = {"X":X_train, "y":y_train, "cat_features": categorical_ints}
        else:
            raise Exception(f"Unrecognized method name {trial.method.name}")

        # Classification is scored on class probabilities.
        predict_fn = est.predict_proba
    elif trial.task.problem == "regression":
        if trial.method.name == "ebm-base":
            est = ExplainableBoostingRegressor(feature_types=ebm_feature_types, **ebm_base_params)
            fit_params = {"X":X_train, "y":y_train}
        elif trial.method.name == "xgboost-base":
            est = XGBRegressor(enable_categorical=True, feature_types=xgb_feature_types, **xgb_base_params)
            fit_params = {"X":X_train, "y":y_train, "verbose": False}
        elif trial.method.name == "aplr-base":
            est = Pipeline(
                [
                    ("ct", ct),
                    (
                        "est",
                        APLRRegressor(m=3000),
                    ),
                ]
            )
            fit_params = {"X":X_train, "y":y_train}
        elif trial.method.name == "lightgbm-base":
            est = LGBMRegressor(verbosity=-1, **lightgbm_base_params)
            fit_params = {"X":X_train, "y":y_train, "categorical_feature": categorical_ints}
        elif trial.method.name == "catboost-base":
            est = CatBoostRegressor(verbose=False, **catboost_base_params)
            fit_params = {"X":X_train, "y":y_train, "cat_features": categorical_ints}
        else:
            raise Exception(f"Unrecognized method name {trial.method.name}")

        predict_fn = est.predict
    else:
        raise Exception(f"Unrecognized problem {trial.task.problem}")

    # Running count of invocations, shown in the progress line. The first call
    # hits NameError (the global does not exist yet) and starts the count at 0.
    global global_counter
    try:
        global_counter += 1
    except NameError:
        global_counter = 0
    
    # Train
    # Warnings are silenced only for the duration of fit().
    print(f"FIT: {global_counter}, {trial.task.origin}, {trial.task.name}, {trial.method.name}, ", end="")
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        start_time = time()
        est.fit(**fit_params)
        elapsed_time = time() - start_time
    trial.log("fit_time", elapsed_time)
    
    # Predict
    start_time = time()
    predictions = predict_fn(X_test)
    elapsed_time = time() - start_time
    trial.log("predict_time", elapsed_time)

    if trial.task.problem == "binary":
        # Keep only the second probability column.
        predictions = predictions[:,1]

        eval_score = roc_auc_score(y_test, predictions)
        trial.log("auc", eval_score)

        eval_score2 = log_loss(y_test, predictions)
        trial.log("log_loss", eval_score2)
    elif trial.task.problem == "multiclass":
        eval_score = roc_auc_score(y_test, predictions, average="weighted", multi_class="ovo")
        trial.log("multi_auc", eval_score)

        eval_score2 = log_loss(y_test, predictions)
        trial.log("cross_entropy", eval_score2)
    elif trial.task.problem == "regression":
        # Use NRMSE-IQR (normalized root mean square error by the interquartile range)
        # so that datasets with large predicted values do not dominate the benchmark
        # and the range is not sensitive to outliers. The rank is identical to RMSE.
        # https://en.wikipedia.org/wiki/Root_mean_square_deviation

        # Get quartile_range from the full dataset for consistency across seeds.
        q75, q25 = np.percentile(y, [75, 25])
        interquartile_range = q75 - q25

        eval_score = root_mean_squared_error(y_test, predictions) / interquartile_range
        trial.log("nrmse", eval_score)
    else:
        raise Exception(f"Unrecognized problem {trial.task.problem}")

    # Completes the "FIT: ..." progress line with the primary score
    # (auc, multi_auc or nrmse).
    print(eval_score)

In [None]:
# Experiment setup and launch.
#   force_recreate : passed to Store(...) when opening the database.
#   exist_ok       : passed to populate_with_datasets(...).
#   is_local       : True  -> sqlite database in the working directory and
#                             the LocalMachine executor;
#                    False -> connection string and Azure settings read from
#                             the environment, AzureContainerInstance executor.
force_recreate=False
exist_ok=True
is_local=True

import datetime
# The timestamp prefix gives each launch its own experiment name.
experiment_name = datetime.datetime.now().strftime("%Y_%m_%d_%H%M__") + "myexperiment"
print("Experiment name: " + experiment_name)

import os
if is_local:
    conn_str = f"sqlite:///{os.getcwd()}/powerlift.db"
else:
    from azure.identity import AzureCliCredential
    credential = AzureCliCredential()

    # Connection string and Azure identifiers come from the environment
    # (optionally via a .env file) so that no secret is stored in the notebook.
    from dotenv import load_dotenv
    load_dotenv()
    # Maps a label to the wheel files handed to the executor for that run.
    experiment_config = {
        # "0.6.3": ["interpret_core-0.6.3-py3-none-any.whl"],
        "0.6.2": ["interpret_core-0.6.2-py3-none-any.whl"],
    }
    TIMEOUT_SEC = 60 * 60 * 6  # 6 hours
    conn_str = os.getenv("DOCKER_DB_URL")
    azure_tenant_id = os.getenv("AZURE_TENANT_ID")
    azure_client_id = os.getenv("AZURE_CLIENT_ID")
    azure_client_secret = os.getenv("AZURE_CLIENT_SECRET")
    subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
    resource_group = os.getenv("AZURE_RESOURCE_GROUP")

from powerlift.bench import retrieve_openml_automl_regression, retrieve_openml_automl_classification, retrieve_openml_cc18, retrieve_catboost_50k, retrieve_pmlb
from powerlift.bench import Benchmark, Store, populate_with_datasets
from powerlift.executors import LocalMachine, AzureContainerInstance
from itertools import chain

# Initialize database (if needed).
store = Store(conn_str, force_recreate=force_recreate)

cache_dir="~/.powerlift"
# Dataset sources to load; uncomment a line to add that source.
data_retrieval = chain(
    retrieve_openml_automl_regression(cache_dir=cache_dir),
    # retrieve_openml_automl_classification(cache_dir=cache_dir),
    retrieve_openml_cc18(cache_dir=cache_dir),
    # retrieve_catboost_50k(cache_dir=cache_dir),
    # retrieve_pmlb(cache_dir=cache_dir),
)

# This downloads datasets once and feeds into the database.
populate_with_datasets(store, data_retrieval, exist_ok=exist_ok)

# Run experiment
benchmark = Benchmark(store, name=experiment_name)

# trial_filter remembers every accepted (name, n_rows, n_cols) key in the
# module-level `global_duplicates` set. Without this reset, running this cell
# a second time in the same kernel makes the filter treat every task as a
# duplicate, so nothing is scheduled. On a fresh kernel this is a no-op.
global_duplicates = set()

if is_local:
    benchmark.run(trial_runner, trial_filter, executor=LocalMachine(store, debug_mode=True))
else:
    for name, wheel_filepaths in experiment_config.items():
        executor = AzureContainerInstance(
            store, azure_tenant_id, azure_client_id, azure_client_secret, subscription_id, resource_group, credential,
            image="benchregistry.azurecr.io/powerlift:0.1.9",
            wheel_filepaths=wheel_filepaths,
            n_running_containers=200, num_cores=4, mem_size_gb=16, delete_group_container_on_complete=True
        )
        benchmark.run(trial_runner, trial_filter, timeout=TIMEOUT_SEC, executor=executor)

In [None]:
# Wait for the trials launched by the previous cell before reading results.
# NOTE(review): blocking behaviour is inferred from the method name — confirm against powerlift.
benchmark.wait_until_complete()

In [None]:
# Open the benchmark again, this time from the connection string rather than
# from the Store object used to launch it.
benchmark = Benchmark(conn_str, name=experiment_name)

# Save the raw measurements so the analysis cells can reload them later.
results_df = benchmark.results()
results_df.to_csv(f"results-{experiment_name}.csv", index=None)

# Print every recorded error message, followed by a tally of trial statuses.
status_df = benchmark.status()
for errmsg in filter(lambda message: message is not None, status_df["errmsg"]):
    print("ERROR: " + str(errmsg))
status_tally = status_df['status'].value_counts()
print(status_tally.to_string(index=True, header=False))

In [None]:
import pandas as pd

# reload if analyzing later
results_df = pd.read_csv(f"results-{experiment_name}.csv")

# Mean of every logged measurement: one row per method, one column per measurement.
averages = (
    results_df.groupby(['method', 'name'])['num_val']
    .mean()
    .unstack()
    .reset_index()
)

# Rank the methods within each (task, measurement) pair (1 = smallest value),
# then average those ranks per measurement: one row per method.
metric_ranks = (
    results_df.pivot_table(values='num_val', index=['task', 'name'], columns='method')
    .rank(axis=1, ascending=True, method='min')
    .groupby('name')
    .mean()
    .T
)
metric_ranks.columns = [f"{col}_RANK" for col in metric_ranks.columns]
metric_ranks = metric_ranks.reset_index()

# Overall rank: the same ranking restricted to the loss-type measurements,
# averaged over all tasks.
overall_rank = (
    results_df.loc[results_df['name'].isin(['log_loss', 'cross_entropy', 'nrmse'])]
    .pivot_table(values='num_val', index='task', columns='method')
    .rank(axis=1, ascending=True, method='min')
    .mean()
    .to_frame(name='RANK')
    .reset_index()
)

desired_columns = ['method', 'RANK', 'auc', 'multi_auc', 'nrmse', 'log_loss_RANK', 'cross_entropy_RANK', 'nrmse_RANK', 'fit_time', 'predict_time']
combined_df = (
    averages
    .merge(metric_ranks, on='method')
    .merge(overall_rank, on='method')
    .sort_values(by='RANK')
    .reindex(columns=desired_columns)
)

print(combined_df.to_string(index=False))

In [None]:
# Number of results behind each figure of the summary table, with the methods
# listed in the same order as in `combined_df`.
desired_columns = ['method', 'RANK', 'auc', 'multi_auc', 'nrmse', 'log_loss', 'cross_entropy', 'fit_time', 'predict_time']
row_order = combined_df["method"]

# fill_value=0: a method with no rows for some measurement has a count of 0.
# A plain unstack() would leave NaN there, and that NaN would then propagate
# into the RANK total computed below.
counts = results_df.groupby(['method', 'name']).size().unstack(fill_value=0)
counts = counts.reindex(row_order, axis=0).reset_index()

# RANK is computed from log_loss, cross_entropy and nrmse, so its count is the
# sum of whichever of those measurements are present.
counts['RANK'] = 0
for metric in ('log_loss', 'cross_entropy', 'nrmse'):
    if metric in counts.columns:
        counts['RANK'] += counts[metric]

counts = counts.reindex(columns=desired_columns)
print(counts.to_string(index=False))

In [None]:
import numpy as np

# Fit times with one row per task and one column per method; rows that lack a
# value for any method are dropped.
fit_times = (
    results_df.loc[results_df['name'] == 'fit_time']
    .pivot_table(values='num_val', index='task', columns='method')
    .dropna()
)
fit_times["ratios"] = fit_times['ebm-base'] / fit_times['xgboost-base']

# 90th down to 10th percentile of the EBM / XGBoost fit-time ratio.
decile_levels = list(range(90, 0, -10))
fit_times_deciles = [
    f"{decile:.2f}  "
    for decile in np.percentile(fit_times["ratios"], decile_levels)
]
max_ratio = fit_times["ratios"].max()
min_ratio = fit_times["ratios"].min()

print("fit time ratio deciles:")
print(*fit_times_deciles)
print(f"max: {max_ratio:.2f}")
print(f"min: {min_ratio:.2f}")