In [1]:
import openml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

In [2]:
HYPERPARAMETERS_SPACE_RFC = {
    'n_estimators': list(range(1, 2001)),
    'max_depth': [None] + list(range(10, 51)),
    'min_samples_split': list(range(2, 11)),
    'min_samples_leaf': list(range(1, 5)),
    'max_features': ['auto', 'sqrt', 'log2'] + list(np.arange(0.1, 1.1, 0.1)),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced'],
    'max_samples': [None] + list(np.arange(0.1, 1.1, 0.1)),
}

In [3]:
HYPERPARAMETERS_SPACE_SVM = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': 2.0 ** np.arange(-10, 11),
    'gamma': 2.0 ** np.arange(-10, 11),
    'degree': np.arange(2, 6, 1)
}

In [4]:
HYPERPARAMETERS_SPACE_XGB = {
    "n_estimators": np.arange(1, 1000),
    "max_depth": np.arange(1, 1000),
    "max_leaves": np.arange(0, 10000),
    'min_child_weight':np.arange(1, 50, 1),
    "grow_policy": ['depthwise', 'lossguide'],
    "learning_rate": np.random.uniform(0, 1, 100),
    "booster": ['gbtree', 'gblinear', 'dart'],
    "gamma": np.random.uniform(0, 1, 100),
    "subsumple": np.random.uniform(0, 1, 100),
    "colsample_bytree": np.random.uniform(0, 1, 100),
    "reg_alpha": [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100],
    "reg_lambda": [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100],
}

In [5]:
def randomly_choose_hyperparameters(hyperparameters_space):
    hyperparameters = {}
    for key, value in hyperparameters_space.items():
        hyperparameters[key] = np.random.choice(value)
    return hyperparameters

In [6]:
labels = {
    44: 'class',
    1504: 'Class',
    37: 'class',
    1494: 'Class'
}

In [7]:
NO_ITER = 2

hparams_results = {}
for epoch in tqdm(range(NO_ITER)):
    while True:
        try:
            # chosen_hyperparameters = randomly_choose_hyperparameters(HYPERPARAMETERS_SPACE_RFC)
            # chosen_hyperparameters = randomly_choose_hyperparameters(HYPERPARAMETERS_SPACE_SVM)
            chosen_hyperparameters = randomly_choose_hyperparameters(HYPERPARAMETERS_SPACE_XGB)

            results = []

            for dataset_number, label in labels.items():
                dataset = openml.datasets.get_dataset(dataset_number)
                df = dataset.get_data()[0]
                df[label] = np.where(df[label] == df[label].cat.categories[0], 0, 1)
                train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df[label])

                X_train = train.drop(label, axis=1)
                y_train = train[label]
                X_test = test.drop(label, axis=1)
                y_test = test[label]

                # clf = RandomForestClassifier(**chosen_hyperparameters)
                # clf = SVC(**chosen_hyperparameters)
                clf = XGBClassifier(**chosen_hyperparameters, enable_categorical=True)
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)
                auc = roc_auc_score(y_test, y_pred)
                results.append(auc)

            avg_auc = np.mean(results)

            if avg_auc in hparams_results:
                hparams_results[avg_auc].append(chosen_hyperparameters)
            else:
                hparams_results[avg_auc] = [chosen_hyperparameters]

            break

        except:
            print('Error occured')
            continue


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:01<00:00,  1.32it/s]


In [8]:
best_avg_auc = max(hparams_results.keys())
hparams_results[best_avg_auc][0]

{'n_estimators': 67,
 'max_depth': 27,
 'max_leaves': 7616,
 'min_child_weight': 13,
 'grow_policy': 'lossguide',
 'learning_rate': 0.7772091164648717,
 'booster': 'gblinear',
 'gamma': 0.347198107322894,
 'subsumple': 0.9331275642364245,
 'colsample_bytree': 0.9992961378955276,
 'reg_alpha': 0.0001,
 'reg_lambda': 0.001}

In [9]:
best_avg_auc

0.8538170751450929