# Tunning XGBoost, Catboost and Lightgbm Classifiers Hyperparameters with Hyperopt

## Import

### Modules

In [328]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from hyperopt import STATUS_FAIL, STATUS_OK, Trials, fmin, hp, tpe
from lightgbm import LGBMClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import f1_score, log_loss, roc_auc_score, get_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier


### Data

In [243]:
data = load_breast_cancer()
X = data['data']
y = data['target']

## Data Preprocessing

In [244]:
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.75)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# sss = StratifiedShuffleSplit(n_splits=1, train_size=0.9)
# for train_index, val_index in sss.split(X, y):
#     X_train, X_val = X[train_index], X[val_index]
#     y_train, y_val = y[train_index], y[val_index]

## Setup Estimators

Some basic parameters:

* `learning_rate` [X/L/C]: learning rate (alias: `eta` )
* `max_depth` [X/L/C]: maximum depth of trees
* `n_estimators` [X/L/C]: no. of boosting iterations
* `min_child_weight` [X/L]: minimum sum of instance (hessian) weight needed in a child
* `min_child_samples` [L/C]: minimum no. of data in one leaf
* `subsample` [X/L/C]: subsample ratio of the training instances (note that for CatBoost this parameter can be used only if either Poisson or Bernoulli bootstrap_type is * `selected`)
* `colsample_bytree` [X/L]: subsample ratio of columns in tree building
* `colsample_bylevel` [X/C]: subsample ratio of columns for each level in tree building
* `colsample_bynode` [X]: subsample ratio of columns for each node
* `tree_method` [X]: tree construction method
* `boosting` [L]: tree construction method
* `boosting_type` [C]: Ordered for ordered boosting or Plain for classic
* `early_stopping_rounds` [X/L/C]: parameter for fit() — stop the training if one metric of a validation data does not improve in last early_stopping_rounds rounds
* `eval_metric` [X/L/C]: evaluation metrics for validation data

In [181]:
hyperparameters = {
    "xgb_classifier": {
        'max_depth' : hp.randint('max_depth', 10),
        'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators' : hp.randint('n_estimators', 250),
        'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
        'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
        'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }

    ,"lgb_classifier": {
        'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
        'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
        'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
        'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
        'subsample':        hp.uniform('subsample', 0.8, 1),
        'n_estimators':     100,
    }

    ,"ctb_classifier": {
        'learning_rate':     hp.choice('learning_rate',     np.arange(0.05, 0.31, 0.05)),
        'max_depth':         hp.choice('max_depth',         np.arange(5, 16, 1, dtype=int)),
        'colsample_bylevel': hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
        'n_estimators':      100,
    }
}

In [182]:
class select_estimator:
    def __init__(self, X_train, X_val, y_train, y_val):
        self.X = {"train": X_train, "val": X_val}
        self.y = {"train": y_train, "val": y_val}
        self.loss = {"f1_score": []
                    ,"log_loss": []
                    ,"roc_auc_score": []
                    }
        self.metrics = {"f1_score": f1_score
                        ,"log_loss": log_loss
                        ,"roc_auc_score": roc_auc_score
                    }

    def xgb_classifier(self, parameters):
        estimator = XGBClassifier(**parameters, eval_metric="logloss")
        estimator.fit(self.X["train"], self.y["train"])
        return self.get_loss(estimator)

    def ctb_classifier(self, parameters):
        estimator = CatBoostClassifier(**parameters)
        estimator.fit(self.X["train"], self.y["train"])
        return self.get_loss(estimator)

    def lgb_classifier(self, parameters):
        estimator = LGBMClassifier(**parameters)
        estimator.fit(self.X["train"], self.y["train"])
        return self.get_loss(estimator)

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
            
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}

        return result, trials


    def get_loss(self, estimator):

        for dataset_type, X in self.X.items():
            y_pred_class = estimator.predict(X)
            y_pred_proba = estimator.predict_proba(X)[:, 1]

            for metric in self.loss.keys():
                if metric in ["f1_score"]:
                    self.loss[metric].append((1.0 - self.metrics[metric](self.y[dataset_type], y_pred_class))**2)

                elif metric in ["roc_auc_score"]:
                    self.loss[metric].append((1.0 - self.metrics[metric](self.y[dataset_type], y_pred_proba))**2)

                elif metric in ["log_loss"]:
                    self.loss[metric].append((self.metrics[metric](self.y[dataset_type], y_pred_class))**2)
                
        loss_df = pd.DataFrame.from_dict(self.loss)
        loss = loss_df.sum(axis=0).sum()

        return {'loss': np.sqrt(loss), 'status': STATUS_OK}

In [185]:
obj = select_estimator(X_train, X_val, y_train, y_val)

parameters = obj.process(fn_name='xgb_classifier', space=hyperparameters['xgb_classifier'], trials=Trials(), algo=tpe.suggest, max_evals=100)

# Simple XGBoost

In [224]:
space = {
        'max_depth' : hp.randint('max_depth', 10),
        'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators' : hp.randint('n_estimators', 250),
        'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
        'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
        'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }

In [225]:
def xgb_cost(space, X_train, y_train, X_val, y_val):

    warnings.filterwarnings(action='ignore')

    classifier = XGBClassifier(n_estimators = space['n_estimators']
                            ,max_depth = int(space['max_depth'])
                            ,learning_rate = space['learning_rate']
                            ,gamma = space['gamma']
                            ,min_child_weight = space['min_child_weight']
                            ,subsample = space['subsample']
                            ,colsample_bytree = space['colsample_bytree']
                            ,use_label_encoder=False
                            ,eval_metric="logloss"
                            )
    classifier.fit(X_train, y_train)
    y_proba_val = classifier.predict_proba(X_val)[:, 1]
    y_class_val = classifier.predict(X_val)

    rocauc_val = roc_auc_score(y_val, y_proba_val)
    f1_val = f1_score(y_val, y_class_val)
    logloss_val = log_loss(y_val, y_class_val)

    y_proba_train = classifier.predict_proba(X_train)[:, 1]
    y_class_train = classifier.predict(X_train)

    # Applying k-Fold Cross Validation
    rocauc_train = roc_auc_score(y_train, y_proba_train)
    f1_train = f1_score(y_train, y_class_train)
    logloss_train = log_loss(y_train, y_class_train)
    
    # print(f"CrossValMean: {CrossValMean}")
    train_proportions = X_train.shape[0]/(X_train.shape[0] + X_val.shape[0])

    loss = ((1.0 - rocauc_val)**2 + (1.0 - f1_val)**2 + logloss_val**2)/train_proportions
    loss += ((1.0 - rocauc_train)**2 + (1.0 - f1_train)**2 + logloss_train**2)/(1-train_proportions)

    return {'loss': np.sqrt(loss), 'status': STATUS_OK }

In [226]:
objective = lambda x: cost(x, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

In [227]:
trials = Trials()

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print("Best: ", best)

 # Fitting XGBoost to the Training set



100%|██████████| 100/100 [00:10<00:00,  9.55trial/s, best loss: 3.3280711111346004e-15]
Best:  {'colsample_bytree': 0.38, 'gamma': 0.22, 'learning_rate': 0.45, 'max_depth': 9, 'min_child_weight': 1.0, 'n_estimators': 179, 'subsample': 0.65}


In [228]:
best

{'colsample_bytree': 0.38,
 'gamma': 0.22,
 'learning_rate': 0.45,
 'max_depth': 9,
 'min_child_weight': 1.0,
 'n_estimators': 179,
 'subsample': 0.65}

In [125]:
classifier = XGBClassifier(**best)
classifier.fit(X_train, y_train)

 

# Applying k-Fold Cross Validation

from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)

CrossValMean = accuracies.mean()

print("Final CrossValMean: ", CrossValMean)

 

CrossValSTD = accuracies.std()

 

# Predicting the Test set results

y_pred = classifier.predict(X_test)

y_pred = pd.DataFrame(y_pred)

y_pred.columns = ['Survived']

submission = submission.join(y_pred)

 

# Exporting dataset to csv

submission.to_csv("Titanic_Submission.csv", index=False, sep=',')

Final CrossValMean:  0.9628205128205127


NameError: name 'pd' is not defined

# New

In [187]:
space = {
    "classifier" : hp.choice('classifier', ["XGBClassifier", "CatBoostClassifier", "LGBMClassifier"])
    }

In [203]:
def fun(x):
    output =  {"XGBClassifier": 1, "CatBoostClassifier": 0, "LGBMClassifier": -1}
    return output[x["classifier"]]

In [204]:
trials = Trials()

best = fmin(fn=fun,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print("Best: ", best)

100%|██████████| 100/100 [00:00<00:00, 416.05trial/s, best loss: -1.0]
Best:  {'classifier': 2}


In [369]:
from abc import ABC, abstractmethod

class BayesSearchCV:

    def __init__(self, estimator, param_distributions, n_iter=10
        ,scoring: list[str]=None, cv=None, random_state=None
        ,algo=tpe.suggest, trials=Trials()) -> None:
        self.estimator = estimator
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.random_state = random_state
        self.cv = cv
        self.algo = algo
        self.trials = trials
        self.scoring = scoring

    def fit(self, X: pd.DataFrame, y=None):
        objective = lambda space: self._cost(space, X=X, y=y)
        try:
            result = fmin(fn=objective, space=self.param_distributions
                        ,algo=self.algo, max_evals=self.n_iter
                        ,trials=self.trials)
            
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}

        self.best_estimator_ = self._instantiate_estimator(result)
        self.best_estimator_.fit(X, y)
        return

    def _get_splits(self, X: pd.DataFrame, y=None):
        if not isinstance(self.cv, StratifiedKFold):
            msg = f"Cross validation not yet implemented for type {type(self.cv)}"
            NotImplementedError(msg)
            
        for train_index, test_index in self.cv.split(X, y):
            yield X[train_index], X[test_index], y[train_index], y[test_index]

    def _cost(self, hyperparameters, X, y):
        estimator = self._instantiate_estimator(hyperparameters)

        loss_dict = {metric_name: [] for metric_name in self.scoring}

        for X_train, X_val, y_train, y_val in self._get_splits(X, y):
            estimator.fit(X_train, y_train)

            for p_metric in self.scoring:
                scorer = get_scorer(p_metric)
                if p_metric in ["f1_score"]:
                    loss_dict[p_metric].append((1.0 - scorer(estimator, X_train, y_train))**2)
                    loss_dict[p_metric].append((1.0 - scorer(estimator, X_train, y_train))**2)

                elif p_metric in ["roc_auc_score"]:
                    loss_dict[p_metric].append((1.0 - scorer(estimator, X_train, y_train))**2)
                    loss_dict[p_metric].append((1.0 - scorer(estimator, X_val, y_val))**2)

                elif p_metric in ["log_loss"]:
                    loss_dict[p_metric].append((scorer(estimator, X_train, y_train))**2)
                    loss_dict[p_metric].append((scorer(estimator, X_train, y_train))**2)
                
        loss_df = pd.DataFrame.from_dict(loss_dict)
        loss = loss_df.sum(axis=0).sum()

        return {'loss': np.sqrt(loss), 'status': STATUS_OK}

    def _instantiate_estimator(self, hyperarameters):
        estimator_cls = self.estimator.__class__
        return estimator_cls(**hyperarameters)

In [370]:
space = {
        'max_depth' : hp.randint('max_depth', 10),
        'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators' : hp.randint('n_estimators', 250),
        'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
        'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
        'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }
cv = BayesSearchCV(XGBClassifier(learning_rate=0.1), param_distributions=space, scoring=["roc_auc"], cv=StratifiedKFold())

In [371]:
cv.fit(X_train, y_train)

100%|██████████| 10/10 [00:05<00:00,  1.73trial/s, best loss: 0.0]


In [372]:
estimator = cv.best_estimator_

In [373]:
estimator

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.55, gamma=0.08, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.27, max_delta_step=0, max_depth=8,
              min_child_weight=7.0, missing=nan, monotone_constraints='()',
              n_estimators=212, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.77,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [353]:
pm = sklearn.metrics.SCORERS["f1"]
pm()

TypeError: __call__() missing 3 required positional arguments: 'estimator', 'X', and 'y_true'

In [354]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [342]:
get_scorer("f1")

make_scorer(f1_score, average=binary)

In [296]:
module = __import__()
class_ = getattr(module, model.__class__.__name__)
instance = class_()

ModuleNotFoundError: No module named 'XGBClassifier'

In [322]:
a = XGBClassifier(learning_rate=0.1)
k2 = a.__class__
a2 = k2(colsample_bytree=0.5)

In [323]:
a2

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.5, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)