# Tunning XGBoost, Catboost and Lightgbm Classifiers Hyperparameters with Hyperopt

## Import

### Modules

In [328]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from hyperopt import STATUS_FAIL, STATUS_OK, Trials, fmin, hp, tpe
from lightgbm import LGBMClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import f1_score, log_loss, roc_auc_score, get_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier


### Data

In [243]:
data = load_breast_cancer()
X = data['data']
y = data['target']

## Data Preprocessing

In [244]:
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.75)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# sss = StratifiedShuffleSplit(n_splits=1, train_size=0.9)
# for train_index, val_index in sss.split(X, y):
#     X_train, X_val = X[train_index], X[val_index]
#     y_train, y_val = y[train_index], y[val_index]

## Setup Estimators

Some basic parameters:

* `learning_rate` [X/L/C]: learning rate (alias: `eta` )
* `max_depth` [X/L/C]: maximum depth of trees
* `n_estimators` [X/L/C]: no. of boosting iterations
* `min_child_weight` [X/L]: minimum sum of instance (hessian) weight needed in a child
* `min_child_samples` [L/C]: minimum no. of data in one leaf
* `subsample` [X/L/C]: subsample ratio of the training instances (note that for CatBoost this parameter can be used only if either Poisson or Bernoulli bootstrap_type is * `selected`)
* `colsample_bytree` [X/L]: subsample ratio of columns in tree building
* `colsample_bylevel` [X/C]: subsample ratio of columns for each level in tree building
* `colsample_bynode` [X]: subsample ratio of columns for each node
* `tree_method` [X]: tree construction method
* `boosting` [L]: tree construction method
* `boosting_type` [C]: Ordered for ordered boosting or Plain for classic
* `early_stopping_rounds` [X/L/C]: parameter for fit() — stop the training if one metric of a validation data does not improve in last early_stopping_rounds rounds
* `eval_metric` [X/L/C]: evaluation metrics for validation data

In [181]:
hyperparameters = {
    "xgb_classifier": {
        'max_depth' : hp.randint('max_depth', 10),
        'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators' : hp.randint('n_estimators', 250),
        'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
        'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
        'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }

    ,"lgb_classifier": {
        'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
        'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
        'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
        'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
        'subsample':        hp.uniform('subsample', 0.8, 1),
        'n_estimators':     100,
    }

    ,"ctb_classifier": {
        'learning_rate':     hp.choice('learning_rate',     np.arange(0.05, 0.31, 0.05)),
        'max_depth':         hp.choice('max_depth',         np.arange(5, 16, 1, dtype=int)),
        'colsample_bylevel': hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
        'n_estimators':      100,
    }
}

In [182]:
class select_estimator:
    def __init__(self, X_train, X_val, y_train, y_val):
        self.X = {"train": X_train, "val": X_val}
        self.y = {"train": y_train, "val": y_val}
        self.loss = {"f1_score": []
                    ,"log_loss": []
                    ,"roc_auc_score": []
                    }
        self.metrics = {"f1_score": f1_score
                        ,"log_loss": log_loss
                        ,"roc_auc_score": roc_auc_score
                    }

    def xgb_classifier(self, parameters):
        estimator = XGBClassifier(**parameters, eval_metric="logloss")
        estimator.fit(self.X["train"], self.y["train"])
        return self.get_loss(estimator)

    def ctb_classifier(self, parameters):
        estimator = CatBoostClassifier(**parameters)
        estimator.fit(self.X["train"], self.y["train"])
        return self.get_loss(estimator)

    def lgb_classifier(self, parameters):
        estimator = LGBMClassifier(**parameters)
        estimator.fit(self.X["train"], self.y["train"])
        return self.get_loss(estimator)

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
            
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}

        return result, trials


    def get_loss(self, estimator):

        for dataset_type, X in self.X.items():
            y_pred_class = estimator.predict(X)
            y_pred_proba = estimator.predict_proba(X)[:, 1]

            for metric in self.loss.keys():
                if metric in ["f1_score"]:
                    self.loss[metric].append((1.0 - self.metrics[metric](self.y[dataset_type], y_pred_class))**2)

                elif metric in ["roc_auc_score"]:
                    self.loss[metric].append((1.0 - self.metrics[metric](self.y[dataset_type], y_pred_proba))**2)

                elif metric in ["log_loss"]:
                    self.loss[metric].append((self.metrics[metric](self.y[dataset_type], y_pred_class))**2)
                
        loss_df = pd.DataFrame.from_dict(self.loss)
        loss = loss_df.sum(axis=0).sum()

        return {'loss': np.sqrt(loss), 'status': STATUS_OK}

In [185]:
obj = select_estimator(X_train, X_val, y_train, y_val)

parameters = obj.process(fn_name='xgb_classifier', space=hyperparameters['xgb_classifier'], trials=Trials(), algo=tpe.suggest, max_evals=100)

# Simple XGBoost

In [224]:
space = {
        'max_depth' : hp.randint('max_depth', 10),
        'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators' : hp.randint('n_estimators', 250),
        'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
        'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
        'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }

In [225]:
def xgb_cost(space, X_train, y_train, X_val, y_val):

    warnings.filterwarnings(action='ignore')

    classifier = XGBClassifier(n_estimators = space['n_estimators']
                            ,max_depth = int(space['max_depth'])
                            ,learning_rate = space['learning_rate']
                            ,gamma = space['gamma']
                            ,min_child_weight = space['min_child_weight']
                            ,subsample = space['subsample']
                            ,colsample_bytree = space['colsample_bytree']
                            ,use_label_encoder=False
                            ,eval_metric="logloss"
                            )
    classifier.fit(X_train, y_train)
    y_proba_val = classifier.predict_proba(X_val)[:, 1]
    y_class_val = classifier.predict(X_val)

    rocauc_val = roc_auc_score(y_val, y_proba_val)
    f1_val = f1_score(y_val, y_class_val)
    logloss_val = log_loss(y_val, y_class_val)

    y_proba_train = classifier.predict_proba(X_train)[:, 1]
    y_class_train = classifier.predict(X_train)

    # Applying k-Fold Cross Validation
    rocauc_train = roc_auc_score(y_train, y_proba_train)
    f1_train = f1_score(y_train, y_class_train)
    logloss_train = log_loss(y_train, y_class_train)
    
    # print(f"CrossValMean: {CrossValMean}")
    train_proportions = X_train.shape[0]/(X_train.shape[0] + X_val.shape[0])

    loss = ((1.0 - rocauc_val)**2 + (1.0 - f1_val)**2 + logloss_val**2)/train_proportions
    loss += ((1.0 - rocauc_train)**2 + (1.0 - f1_train)**2 + logloss_train**2)/(1-train_proportions)

    return {'loss': np.sqrt(loss), 'status': STATUS_OK }

In [226]:
objective = lambda x: cost(x, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

In [227]:
trials = Trials()

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print("Best: ", best)

 # Fitting XGBoost to the Training set



100%|██████████| 100/100 [00:10<00:00,  9.55trial/s, best loss: 3.3280711111346004e-15]
Best:  {'colsample_bytree': 0.38, 'gamma': 0.22, 'learning_rate': 0.45, 'max_depth': 9, 'min_child_weight': 1.0, 'n_estimators': 179, 'subsample': 0.65}


In [228]:
best

{'colsample_bytree': 0.38,
 'gamma': 0.22,
 'learning_rate': 0.45,
 'max_depth': 9,
 'min_child_weight': 1.0,
 'n_estimators': 179,
 'subsample': 0.65}

In [125]:
classifier = XGBClassifier(**best)
classifier.fit(X_train, y_train)

 

# Applying k-Fold Cross Validation

from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)

CrossValMean = accuracies.mean()

print("Final CrossValMean: ", CrossValMean)

 

CrossValSTD = accuracies.std()

 

# Predicting the Test set results

y_pred = classifier.predict(X_test)

y_pred = pd.DataFrame(y_pred)

y_pred.columns = ['Survived']

submission = submission.join(y_pred)

 

# Exporting dataset to csv

submission.to_csv("Titanic_Submission.csv", index=False, sep=',')

Final CrossValMean:  0.9628205128205127


NameError: name 'pd' is not defined

# Bayes Search CV

In [590]:
import sys
import traceback
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import Union

from hyperopt import STATUS_FAIL, STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import KFold


class BayesSearchCV(ABC):

    def __init__(self, estimator, param_distributions: dict, scoring: list[str]
        ,n_iter: int=10, cv: Union[int, Iterable]=5, random_state: int=None
        ,algo=tpe.suggest, trials: Trials=Trials()) -> None:
        """[summary]

        Args:
            estimator: Estimator object
            param_distributions (dict): Search space containing hyperparameters
            scoring (list[str]): List of performance metrics to measure the estimator performance.
                                Select one from sklearn.metrics.SCORERS.keys()
            n_iter (int, optional): Max number of iterations. Defaults to 10.
            cv (int or Iterable, optional): int, cross-validation generator or an iterable. Defaults to 5.
            random_state (int, optional): Pseudo random number generator state used for random uniform sampling. Defaults to None.
            algo (optional): Algorithm to for distribution search. Defaults to tpe.suggest.
            trials (Trials, optional): [description]. Defaults to Trials().
        """
        self.estimator = estimator
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.random_state = random_state
        self.cv = cv
        self.algo = algo
        self.trials = trials
        self.scoring = scoring

    def fit(self, X: pd.DataFrame, y=None):
        """Fit estimator with optimal hyperparameters

        Args:
            X (pd.DataFrame): Predictors
            y (pd.DataFrame): Target
        """
        self.cv_results_ = pd.DataFrame()
        min_loss = np.inf

        for train_index, val_index in self._get_splits(X, y):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            objective = lambda space: self._cost(X=X_train, y=y_train, hyperparameters=space)

            try:
                hyperparameters = fmin(fn=objective, space=self.param_distributions
                            ,algo=self.algo, max_evals=self.n_iter
                            ,trials=self.trials)

            except KeyError:
                exc_info = sys.exc_info()
                traceback.print_exception(*exc_info)
                return {'status': STATUS_FAIL,
                        'exception': str(sys.exc_info())}

            estimator, loss_df, loss = self._cost(X_val, y_val, hyperparameters, return_fit_estimator=True)

            if loss < min_loss:
                min_loss = loss
                self.best_estimator_ = estimator

            self.cv_results_ = pd.concat([self.cv_results_ , loss_df.copy()])

        self.cv_results_.reset_index(inplace=True, drop=True)
        self.cv_results_.rename(columns={col: f"{col}_loss" for col in self.cv_results_.columns}, inplace=True)


    def _get_splits(self, X: pd.DataFrame, y=None):
        """Instantiate and/or get training and validation datasets

        Args:
            X (pd.DataFrame): Predictor
            y (pd.DataFrame): Target

        Returns:
            [type]: [description]

        Yields:
            [type]: [description]
        """

        if isinstance(self.cv, int): 
            self.cv = KFold(n_splits=self.cv, random_state=self.random_state)

        elif isinstance(self.cv, StratifiedKFold):
            pass

        else:
            msg = f"Cross validation not yet implemented for type {type(self.cv)}"
            NotImplementedError(msg)
            
        for train_index, test_index in self.cv.split(X, y):
            yield train_index, test_index


    def _cost(self, X: pd.DataFrame, y: pd.DataFrame, hyperparameters: dict
            ,return_fit_estimator: bool=False):
        """Evaluates the cost function for the trained estimator

        Args:
            X (pd.DataFrame): Predictor
            y (pd.DataFrame): Target
            hyperarameters (dict):
        """
        estimator = self._instantiate_estimator(hyperparameters)

        loss_dict = {metric_name: [] for metric_name in self.scoring}

        estimator.fit(X, y)

        for p_metric in self.scoring:
            scorer = get_scorer(p_metric)
            
            if p_metric in ["f1"]:
                loss_dict[p_metric].append((1.0 - scorer(estimator, X, y))**2)

            elif p_metric in ["roc_auc"]:
                loss_dict[p_metric].append((1.0 - scorer(estimator, X, y))**2)

            elif p_metric in ["neg_log_loss"]:
                loss_dict[p_metric].append((scorer(estimator, X, y))**2)

            else:
                msg = f"Metric {p_metric} not implemented."
                raise NotImplementedError(msg)

        loss_df = pd.DataFrame.from_dict(loss_dict)
        loss = loss_df.sum(axis=0).sum()

        if return_fit_estimator:
            return estimator, loss_df, loss

        return {'loss': np.sqrt(loss), 'status': STATUS_OK}



    def _instantiate_estimator(self, hyperarameters: dict):
        """Instantiate estimator with selected hyperparameters

        Args:
            hyperarameters (dict):

        Returns:
            [type]: Estimator
        """
        estimator_cls = self.estimator.__class__
        return estimator_cls(**hyperarameters)


In [591]:
space = {
        'max_depth' : hp.randint('max_depth', 10),
        'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators' : hp.randint('n_estimators', 250),
        'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
        'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
        'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
        'eval_metric': "logloss"
        }

cv = BayesSearchCV(XGBClassifier(learning_rate=0.1), param_distributions=space, scoring=["roc_auc", "f1"], cv=StratifiedKFold())
cv.fit(X_train, y_train)

100%|██████████| 10/10 [00:00<00:00, 18.44trial/s, best loss: 0.009360113324801356]
100%|██████████| 10/10 [00:00<?, ?trial/s, best loss=?]
100%|██████████| 10/10 [00:00<?, ?trial/s, best loss=?]
100%|██████████| 10/10 [00:00<?, ?trial/s, best loss=?]
100%|██████████| 10/10 [00:00<?, ?trial/s, best loss=?]


In [592]:
cv.cv_results_

Unnamed: 0,roc_auc_loss,f1_loss
0,1.013066e-05,0.000331
1,6.030812e-05,0.000758
2,3.128894e-06,0.000343
3,3.476549e-07,9.1e-05
4,8.691371e-08,8.7e-05


In [593]:
cv.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.55, gamma=0.28, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.26, max_delta_step=0, max_depth=9,
              min_child_weight=6.0, missing=nan, monotone_constraints='()',
              n_estimators=93, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.99,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [472]:
space = {"lgb_classifier": {
        'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
        'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
        'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
        'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
        'subsample':        hp.uniform('subsample', 0.8, 1),
        # 'n_estimators':     100,
    }}
cv = ClassifierBayesSearchCV(LGBMClassifier(), param_distributions=space, scoring=["roc_auc", "f1"], cv=StratifiedKFold())
cv.fit(X_train, y_train)
# cv.best_estimator_

100%|██████████| 10/10 [00:00<?, ?trial/s, best loss=?]


{'colsample_bytree': 0.41000000000000003,
 'gamma': 0.32,
 'learning_rate': 0.5,
 'max_depth': 5,
 'min_child_weight': 8.0,
 'n_estimators': 137,
 'subsample': 0.13}

In [353]:
pm = sklearn.metrics.SCORERS["f1"]
pm()

TypeError: __call__() missing 3 required positional arguments: 'estimator', 'X', and 'y_true'

In [517]:
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [453]:
get_scorer("f1_score")

ValueError: 'f1_score' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.

In [296]:
module = __import__()
class_ = getattr(module, model.__class__.__name__)
instance = class_()

ModuleNotFoundError: No module named 'XGBClassifier'

In [379]:
a = XGBClassifier(learning_rate=0.1, eval_metric="logloss")