In [63]:
import openml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

In [64]:
# Search space for the random search over XGBClassifier keyword arguments.
# Each key is a keyword argument name; each value is the collection of
# candidates from which one value is drawn per trial.
# NOTE: the np.random.uniform grids are sampled once, when this cell runs, from
# the global NumPy RNG, so the candidate values change between kernel runs
# unless the RNG is seeded beforehand.
hyperparameters_space_xgboost = {
    "n_estimators": np.arange(1, 1000),
    "max_depth": np.arange(1, 1000),
    "max_leaves": np.arange(0, 10000),  # 0 means no limit on the number of leaves
    "min_child_weight": np.arange(1, 50, 1),
    "grow_policy": ['depthwise', 'lossguide'],
    "learning_rate": np.random.uniform(0, 1, 100),
    "booster": ['gbtree', 'gblinear', 'dart'],
    "gamma": np.random.uniform(0, 1, 100),
    # Previously misspelled "subsumple": XGBoost does not know that name, so
    # the row-subsampling ratio was never actually tuned.
    "subsample": np.random.uniform(0, 1, 100),
    "colsample_bytree": np.random.uniform(0, 1, 100),
    "reg_alpha": [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100],
    "reg_lambda": [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100],
}

In [65]:
def randomly_choose_hyperparameters(hyperparameters_space):
    """Draw one random configuration from a hyperparameter search space.

    Parameters
    ----------
    hyperparameters_space : dict
        Maps each hyperparameter name to its collection of candidate values.

    Returns
    -------
    dict
        Same keys as the input, in the same order; each value is a single
        candidate picked with ``np.random.choice`` (global NumPy RNG).
    """
    return {
        name: np.random.choice(candidates)
        for name, candidates in hyperparameters_space.items()
    }

In [70]:
# OpenML dataset id -> name of the target column in that dataset.
# The ids are passed to openml.datasets.get_dataset further down.
labels = {
    44: "class",
    1504: "Class",
    37: "class",
    1494: "Class",
}

In [71]:
# Sanity check: fetch every dataset listed in `labels` and print the
# (rows, columns) shape of its data frame.
for dataset_number in labels:
    dataset = openml.datasets.get_dataset(dataset_number)
    # get_data() returns a tuple; only its first element (the frame) is needed.
    df, *_ = dataset.get_data()
    print(df.shape)

(4601, 58)
(1941, 34)
(768, 9)
(1055, 42)


In [73]:
NO_EPOCHS = 5  # number of random hyperparameter configurations to evaluate


def _load_binary_split(dataset_number, label):
    """Fetch one OpenML dataset and return a stratified 80/20 train/test split.

    The target column ``label`` is binarised first: rows equal to its first
    category become 0, every other row becomes 1.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    df = openml.datasets.get_dataset(dataset_number).get_data()[0]
    df[label] = np.where(df[label] == df[label].cat.categories[0], 0, 1)
    train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df[label])
    return (
        train.drop(label, axis=1),
        test.drop(label, axis=1),
        train[label],
        test[label],
    )


# The split is deterministic (random_state=42) and does not depend on the
# sampled hyperparameters, so every dataset is fetched and split once instead
# of once per epoch.
dataset_splits = [
    _load_binary_split(dataset_number, label)
    for dataset_number, label in labels.items()
]

# Mean AUC -> hyperparameters. Kept for backward compatibility, but two trials
# with the same mean AUC overwrite each other here.
hparams_results = {}
# Append-only record of every trial as (mean AUC, hyperparameters); unlike the
# dict above it never loses a trial to a key collision.
hparams_history = []

for epoch in tqdm(range(NO_EPOCHS)):
    chosen_hyperparameters = randomly_choose_hyperparameters(hyperparameters_space_xgboost)

    results = []

    for X_train, X_test, y_train, y_test in dataset_splits:
        xgb = XGBClassifier(**chosen_hyperparameters, enable_categorical=True)
        xgb.fit(X_train, y_train)

        # ROC AUC ranks samples by score, so it needs the positive-class
        # probability; hard 0/1 predictions collapse the curve to one point.
        y_score = xgb.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_score)
        results.append(auc)

    avg_auc = np.mean(results)
    hparams_results[avg_auc] = chosen_hyperparameters
    hparams_history.append((avg_auc, chosen_hyperparameters))

 40%|████      | 2/5 [00:07<00:10,  3.51s/it]