In [1]:
import autorootcwd

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score

In [3]:
seed = 42
np.random.seed(seed)

In [4]:
# Load Datasets
datasets = {
    'iris': load_iris(return_X_y=True),
    'digits': load_digits(return_X_y=True),
    'wine': load_wine(return_X_y=True),
    'breast_cancer': load_breast_cancer(return_X_y=True)
}

In [18]:
models = {
    'KNN': {
        'model': KNeighborsClassifier,
        'params': {
            'n_neighbors': hp.quniform('n_neighbors', 1, 30, 1),
            'weights': hp.choice('weights', ['uniform', 'distance']),
            'p': hp.choice('p', [1, 2])
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier,
        'params': {
            'n_estimators': hp.quniform('n_estimators', 100, 2000, 1),
            'max_depth': hp.quniform('max_depth', 10, 100, 1),
            'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
            'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
            'bootstrap': hp.choice('bootstrap', [True, False])
        }
    },
    'XGBoost': {
        'model': XGBClassifier,
        'params': {
            'n_estimators': hp.quniform('n_estimators', 50, 1000, 1),
            'max_depth': hp.quniform('max_depth', 1, 10, 1),
            'learning_rate': hp.uniform('learning_rate', 0.001, 0.3),
            'subsample': hp.uniform('subsample', 0.5, 1),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
        }
    }
}


def objective(params, model, X_train, y_train):
    # Convert float to int for parameters that expect integer values
    if 'n_neighbors' in params:
        params['n_neighbors'] = int(params['n_neighbors'])
    if 'n_estimators' in params:
        params['n_estimators'] = int(params['n_estimators'])
    if 'max_depth' in params and params['max_depth'] is not None:
        params['max_depth'] = int(params['max_depth'])
    if 'min_samples_split' in params and params['min_samples_split'] is not None:
        params['min_samples_split'] = int(params['min_samples_split'])
    if 'min_samples_leaf' in params and params['min_samples_leaf'] is not None:
        params['min_samples_leaf'] = int(params['min_samples_leaf'])
        
    clf = model(**params)
    score = -np.mean(cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy'))
    return {'loss': score, 'status': STATUS_OK}



history = []
best_runs = []
for dataset_name, data in datasets.items():
    X, y = data    
    print(f"Dataset: {dataset_name}")

    for model_name, algo in models.items():
        print(f"Model: {model_name}")
        
        model = algo['model']
        space = algo['params']

        trials = Trials()        
        best = fmin(fn=lambda params: objective(params, model, X, y),
                    space=space,
                    algo=tpe.suggest,
                    max_evals=50,
                    trials=trials)

        # Store all trials data
        for trial in trials.trials:
            params = {key: value[0] for key, value in trial['misc']['vals'].items()}
            score = abs(round(float(trial['result']['loss']), 4))

            # Add a record to the DataFrame for each trial
            history.append({
                'dataset': dataset_name,
                'model': model_name,
                'score': score,
                'hyperparameters': params,
                'iteration': trial['tid']
            })
        
        # add best score to df_best
        best_runs.append({
            'dataset': dataset_name,
            'model': model_name,
            'score': abs(round(float(trials.best_trial['result']['loss']), 4)),
            'hyperparameters': {key: value[0] for key, value in trials.best_trial['misc']['vals'].items()},
            'iteration': trials.best_trial['tid']
        })


# convert the history to a DataFrame
df_history = pd.DataFrame(history)
df_best = pd.DataFrame(best_runs)


# save the history to a CSV file
df_history.to_csv('Wyniki/bayesian-search-history.csv', index=False)
df_best.to_csv('Wyniki/bayesian-search-best.csv', index=False)

Dataset: iris
Model: KNN
  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [00:00<00:00, 135.39trial/s, best loss: -0.9866666666666667]
Model: RandomForest
100%|██████████| 50/50 [02:15<00:00,  2.70s/trial, best loss: -0.9666666666666668]
Model: XGBoost
100%|██████████| 50/50 [04:39<00:00,  5.59s/trial, best loss: -0.9666666666666668]
Dataset: digits
Model: KNN
100%|██████████| 50/50 [00:02<00:00, 21.96trial/s, best loss: -0.9671711544413494]
Model: RandomForest
100%|██████████| 50/50 [08:27<00:00, 10.15s/trial, best loss: -0.9388053234292789]
Model: XGBoost
100%|██████████| 50/50 [23:15<00:00, 27.91s/trial, best loss: -0.9421402042711235]
Dataset: wine
Model: KNN
100%|██████████| 50/50 [00:00<00:00, 81.99trial/s, best loss: -0.8093650793650793]
Model: RandomForest
100%|██████████| 50/50 [02:16<00:00,  2.73s/trial, best loss: -0.9833333333333332]
Model: XGBoost
100%|██████████| 50/50 [02:30<00:00,  3.01s/trial, best loss: -0.9722222222222221]
Dataset: breast_cancer
Model: KNN
100%|██████████| 50/50 [00:00<00:00, 51.31trial/s, best loss:

In [17]:
df_best

Unnamed: 0,dataset,model,score,hyperparameters,iteration
0,iris,KNN,0.99,"{'n_neighbors': 11.0, 'p': 1, 'weights': 1}",24
1,iris,RandomForest,0.96,"{'bootstrap': 1, 'max_depth': 80.0, 'min_sampl...",45
2,iris,XGBoost,0.97,"{'colsample_bytree': 0.5151471049400276, 'lear...",1
3,digits,KNN,0.97,"{'n_neighbors': 4.0, 'p': 1, 'weights': 1}",21
4,digits,RandomForest,0.94,"{'bootstrap': 0, 'max_depth': 50.0, 'min_sampl...",22
5,digits,XGBoost,0.94,"{'colsample_bytree': 0.5419177009554587, 'lear...",44
6,wine,KNN,0.81,"{'n_neighbors': 1.0, 'p': 0, 'weights': 1}",31
7,wine,RandomForest,0.98,"{'bootstrap': 1, 'max_depth': 80.0, 'min_sampl...",0
8,wine,XGBoost,0.98,"{'colsample_bytree': 0.608075978742906, 'learn...",7
9,breast_cancer,KNN,0.94,"{'n_neighbors': 9.0, 'p': 0, 'weights': 0}",1
