# LightGBM

In [1]:
import itertools
import lightgbm
import json
import pickle
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from bayes_opt.observer import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

class LightGBM_binary_bayes_opt:
    """
    Tune a binary LightGBM model with Bayesian optimization (``bayes_opt``).

    Every combination of the categorical candidates in ``cat_params`` gets its
    own ``BayesianOptimization`` run over the numeric bounds in ``num_params``.
    The objective that is maximized is the best AUC on the evaluation set.
    """

    def __init__(self, X_train, y_train, X_eval, y_eval, X_test, y_test,
                 base_params:dict, cat_params:dict,
                 num_params:dict, int_params:dict,
                 load_log=False, num_opts=100, score_ouput='./score.json'):
        """
        Class instance initializer.

        Parameters
        ----------
        X_train, y_train, X_eval, y_eval, X_test, y_test
            Features / labels of the train, evaluation and test splits. They are
            handed to ``lightgbm.Dataset`` unchanged (the test split is only stored).
        base_params : dict
            Fixed LightGBM parameters shared by every run.
        cat_params : dict
            Maps a parameter name to the list of candidate values to enumerate.
        num_params : dict
            Maps a parameter name to its ``(lower, upper)`` bound; used as ``pbounds``.
        int_params
            Names of parameters that must be cast to ``int`` before training. Only
            iterated over, so a list of names (or a dict keyed by name) both work.
        load_log : bool
            When True, previous ``./logs_<idx>.json`` logs are loaded and the search
            continues without new random initial points.
        num_opts : int
            Number of optimization iterations (``n_iter``) per categorical combination.
        score_ouput : str
            Path of the file that receives one JSON line of scores per trained model.
        """
        # data
        self.X_train, self.X_eval, self.X_test = X_train, X_eval, X_test
        self.y_train, self.y_eval, self.y_test = y_train, y_eval, y_test
        # hyperparameters
        self.base_params = base_params
        self.cat_params = cat_params
        self.num_params = num_params
        self.int_params = int_params
        self.cat_params_combinations = []
        self.optimizers = []
        self.current_params = None
        self.all_params = None
        # bayesian optimizer
        self.load_log = load_log
        self.num_opts = num_opts
        # initializing
        self._generate_cat_params_combination()
        # models
        self.lgbs = []
        # metrics
        self.score_ouput = score_ouput
        self.train_score = []
        self.eval_score = []
        self.test_score = []

    def _generate_cat_params_combination(self):
        """
        Generate categorical hyperparameters combinations.

        Stores the Cartesian product of all candidate lists in
        ``self.cat_params_combinations`` as a list of ``{name: value}`` dicts.
        An empty ``cat_params`` yields a single empty combination.
        """
        # Use the instance's own dict: relying on a module-level ``cat_params``
        # only works by accident when such a global happens to exist.
        names = list(self.cat_params.keys())
        self.cat_params_combinations = [
            dict(zip(names, combination))
            for combination in itertools.product(*self.cat_params.values())
        ]

    def _object_score(self, **params_to_optimize):
        """
        Called by the Bayesian optimizer; returns the score to maximize.

        ``self.current_params`` must already hold the base + categorical
        parameters of the current run (``optimize_lgb`` sets it).
        """
        # the optimizer proposes floats; truncate the ones LightGBM needs as int
        for _int_param in self.int_params:
            if _int_param in params_to_optimize:
                params_to_optimize[_int_param] = int(params_to_optimize[_int_param])
        # combine all hyperparameters
        self.all_params = dict(self.current_params, **params_to_optimize)
        return self.train_lgb(self.all_params)

    def train_lgb(self, all_params:dict):
        """
        Train LightGBM with ``all_params`` and return the objective score.

        The booster is appended to ``self.lgbs``, its best train / eval scores
        are appended to ``self.train_score`` / ``self.eval_score`` and the whole
        ``best_score`` dict is appended as one JSON line to ``self.score_ouput``.

        Returns
        -------
        The best ``'auc'`` on the ``'Eval'`` set, so ``'auc'`` must be among the
        metrics configured in ``all_params``.
        """
        train_dataset = lightgbm.Dataset(data=self.X_train, label=self.y_train)
        eval_dataset = lightgbm.Dataset(data=self.X_eval, label=self.y_eval)
        lgb_clf = lightgbm.train(params=all_params,
                                 train_set=train_dataset,
                                 num_boost_round=3000,
                                 valid_sets=[train_dataset, eval_dataset],
                                 valid_names=['Train', 'Eval'],
                                 early_stopping_rounds=400,
                                 verbose_eval=-1,
                                 # exponentially decaying learning rate per iteration
                                 learning_rates=lambda iters: 0.6 * (0.99 ** iters))
        # store model
        self.lgbs.append(lgb_clf)
        # store score and save to file
        self.train_score.append(lgb_clf.best_score['Train'])
        self.eval_score.append(lgb_clf.best_score['Eval'])
        with open(self.score_ouput, 'a') as f:
            json.dump(lgb_clf.best_score, f)
            f.write("\n")

        return lgb_clf.best_score['Eval']['auc']

    def optimize_lgb(self):
        """
        The main entrance of optimizing LightGBM.

        For each categorical combination a new optimizer is created, run, and
        appended to ``self.optimizers``; its steps are logged to
        ``./logs_<idx>.json``.
        """
        # we need to manually go through the categorical hyperparameter combinations
        # and then apply Bayesian optimization to each of them
        for idx, cat_params_combination in enumerate(self.cat_params_combinations):
            print('Current categorical hyperparameters combination #{:d}: {}'
                  .format(idx, cat_params_combination))
            # combine base and current categorical combination to form current_params
            self.current_params = dict(self.base_params, **cat_params_combination)
            # create optimizer
            optimizer = BayesianOptimization(f=self._object_score,
                                             pbounds=self.num_params,
                                             random_state=1213)

            log_path = "./logs_{:d}.json".format(idx)
            if self.load_log:
                # resume: replay the previous log, no new random exploration needed
                load_logs(optimizer, logs=[log_path])
                init_points = 0
            else:
                init_points = 10

            self.logger = JSONLogger(path=log_path)
            # NOTE(review): ``OPTMIZATION_STEP`` (sic) is the spelling this notebook was
            # written against; newer bayes_opt releases may name it
            # ``OPTIMIZATION_STEP`` - confirm against the installed version.
            optimizer.subscribe(Events.OPTMIZATION_STEP, self.logger)
            optimizer.maximize(init_points=init_points, n_iter=self.num_opts)

            # save current optimizer
            self.optimizers.append(optimizer)

class LightGBM_binary_hyperopt:
    """
    Tune a binary LightGBM model with hyperopt's TPE search.

    This is the recommended tuner of the two in this file. The search space is
    built from fixed ``base_params`` plus categorical, integer and float
    parameter descriptions; the loss penalizes the gap between train and eval AUC.
    """
    def __init__(self, X_train, y_train, X_eval, y_eval, X_test, y_test,
                 base_params: dict, cat_params: dict, int_params: dict, float_params: dict,
                 num_opts=1000, trials_path='./trials.pkl', load_trials=False,
                 lgb_num_boost_round=3000, lgb_early_stopping_rounds=400):
        """
        Class instance initializer.

        Parameters
        ----------
        X_train, y_train, X_eval, y_eval, X_test, y_test
            Features / labels of the train, evaluation and test splits.
        base_params : dict
            Fixed LightGBM parameters included unchanged in every trial.
        cat_params : dict
            ``{name: [candidate, ...]}``; searched with ``hp.choice``.
        int_params : dict
            ``{name: (start, stop, step)}``; the arguments of ``np.arange``
            (``stop`` is exclusive), searched with ``hp.choice``.
        float_params : dict
            ``{name: (low, high)}``; searched with ``hp.uniform``.
        num_opts : int
            Number of new evaluations to run. When trials are loaded, the number
            of already stored trials is added to it.
        trials_path : str
            Pickle file the trials database is loaded from / saved to.
        load_trials : bool
            When True, resume from the trials stored at ``trials_path``.
        lgb_num_boost_round, lgb_early_stopping_rounds : int
            Passed to ``lightgbm.train`` for every model that is trained.
        """
        # data
        self.X_train, self.X_eval, self.X_test = X_train, X_eval, X_test
        self.y_train, self.y_eval, self.y_test = y_train, y_eval, y_test
        # Kept for backward compatibility with code that reads these attributes;
        # training itself builds fresh Datasets per run (see ``_train``).
        self.train_dataset = lightgbm.Dataset(data=self.X_train, label=self.y_train)
        self.eval_dataset = lightgbm.Dataset(data=self.X_eval, label=self.y_eval)
        self.test_dateset = lightgbm.Dataset(data=self.X_test, label=self.y_test)
        # hyperparameters
        self.base_params, self.cat_params = base_params, cat_params
        self.int_params, self.float_params = int_params, float_params
        self.all_params = self._init_params()
        # lightgbm other hyperparameter
        self.lgb_num_boost_round = lgb_num_boost_round
        self.lgb_early_stopping_rounds = lgb_early_stopping_rounds
        # optimizer
        self.num_opts, self.trials_path, self.load_trials = num_opts, trials_path, load_trials
        self.trials = self._init_trials()
        # raw result of ``fmin``; filled in by ``optimize_lgb``
        self.best_params = None

    def _init_params(self):
        """
        Initialize hyperparameters.

        Builds the hyperopt expressions for the categorical, integer and float
        parameters (kept on ``self.*_params_hp``) and returns them merged with
        ``base_params`` as the complete search space.
        """
        # categorical hyperparameters
        self.cat_params_hp = {param: hp.choice(param, candidates)
                              for param, candidates in self.cat_params.items()}
        # integer hyperparameters; builtin ``int`` replaces the alias ``np.int``,
        # which recent NumPy versions no longer provide
        self.int_params_hp = {param: hp.choice(param, np.arange(*start_end_step, dtype=int))
                              for param, start_end_step in self.int_params.items()}
        # float hyperparameters
        self.float_params_hp = {param: hp.uniform(param, *candidates)
                                for param, candidates in self.float_params.items()}
        # generate all hyperparameters
        return dict(self.base_params,
                    **self.cat_params_hp,
                    **self.int_params_hp,
                    **self.float_params_hp)

    def _init_trials(self):
        """
        Initialize trials database.

        Returns a fresh ``Trials`` object, or the one unpickled from
        ``self.trials_path`` when ``self.load_trials`` is set. In the latter case
        ``self.num_opts`` is increased by the number of stored trials, because
        ``fmin``'s ``max_evals`` counts the trials already in the database.
        """
        if not self.load_trials:
            return Trials()
        # SECURITY: unpickling executes arbitrary code - only load trials files
        # that were written by this class from a trusted location.
        with open(self.trials_path, "rb") as trials_file:
            trials = pickle.load(trials_file)
        self.num_opts += len(trials.losses())
        return trials

    def _train(self, params):
        """
        Train one booster with ``params`` on the stored train / eval splits.
        """
        # A Dataset is binned lazily on first use with the parameters of that run.
        # Binning parameters that are part of the search (e.g. ``max_bin``,
        # ``min_data_in_bin``) therefore need a Dataset that has not been
        # constructed yet, so every run gets its own pair.
        train_dataset = lightgbm.Dataset(data=self.X_train, label=self.y_train)
        eval_dataset = lightgbm.Dataset(data=self.X_eval, label=self.y_eval)
        return lightgbm.train(params=params,
                              train_set=train_dataset,
                              num_boost_round=self.lgb_num_boost_round,
                              valid_sets=[train_dataset, eval_dataset],
                              valid_names=['Train', 'Eval'],
                              early_stopping_rounds=self.lgb_early_stopping_rounds,
                              verbose_eval=-1,
                              # exponentially decaying learning rate per iteration
                              learning_rates=lambda iters: 0.6 * (0.99 ** iters))

    def _object_score(self, params):
        """
        Using all hyperparameters to train LightGBM. Return the objective function score.

        Returns the dict hyperopt expects: ``'loss'`` (minimized), ``'status'``
        and the train / eval AUC and binary error for later inspection. The
        metrics ``'auc'`` and ``'binary_error'`` must be configured in ``params``.
        """
        best_score = self._train(params).best_score
        train_auc = best_score['Train']['auc']
        eval_auc = best_score['Eval']['auc']
        # the gap between train auc and eval auc is used as an overfitting penalty:
        #   score = eval_auc - (train_auc - eval_auc) = 2 * eval_auc - train_auc
        # hyperopt minimizes, hence the negated score is the loss
        return {'loss': -(2*eval_auc - train_auc),
                'train_auc': train_auc,
                'eval_auc': eval_auc,
                'train_error': best_score['Train']['binary_error'],
                'eval_error': best_score['Eval']['binary_error'],
                'status': STATUS_OK}

    def optimize_lgb(self):
        """
        The main entrance of optimizing LightGBM.

        Runs ``fmin`` with TPE, pickles the trials to ``self.trials_path`` and
        returns (and stores on ``self.best_params``) the raw ``fmin`` result.
        For ``hp.choice`` parameters that result holds the index of the chosen
        candidate, not the candidate itself.
        """
        best_params = fmin(self._object_score, self.all_params, algo=tpe.suggest,
                           max_evals=self.num_opts, trials=self.trials)
        # save trials for further fine-tune
        with open(self.trials_path, "wb") as trials_file:
            pickle.dump(self.trials, trials_file)
        # store best hyperparameters
        self.best_params = best_params

        return best_params

    def best_model(self):
        """
        Use best hyperparameters to train a LightGBM model.

        Raises
        ------
        RuntimeError
            If ``optimize_lgb`` has not produced ``self.best_params`` yet.
        """
        from hyperopt import space_eval

        if getattr(self, 'best_params', None) is None:
            raise RuntimeError("optimize_lgb() must be called before best_model()")
        # ``fmin`` reports hp.choice parameters as indices and leaves out the fixed
        # base parameters; ``space_eval`` maps the result back onto the full space.
        resolved_params = space_eval(self.all_params, self.best_params)
        return self._train(resolved_params)

In [2]:
# @ unused hyperparameters
#               'drop_rate' | used only in dart
#               'max_drop' | used only in dart
#               'skip_drop' | used only in dart
#               'xgboost_dart_mode' | used only in dart
#               'uniform_drop' | used only in dart
#               'drop_seed' | used only in dart
#               'top_rate' | for safety not used
#               'other_rate' | for safety not used
#               'top_k' | used only in voting parallel
#               'monotone_constraints' | default
#               'feature_contri' | default
#               'forcedsplits_filename' | default
#               'forcedbins_filename' | default
#               'refit_decay_rate' | used only in refit task
#               'cegb_penalty_feature_lazy' | default
#               'cegb_penalty_feature_coupled' | default
#               'cegb_tradeoff': (1, 10), \
#               'cegb_penalty_split': (0 ,10), \

## Bayes optimizer

In [4]:
base_params = {'task': 'train', \
               'objective': 'binary', \
               'tree_learner': 'serial', \
               'num_threads': 4, \
               'device_type': 'cpu', \
               'seed': 1213, \
               'num_leaves': 500, \
               ## learning control parameters
               'bagging_seed': 42, \
               'feature_fraction_seed': 3, \
               'first_metric_only': False, \
               'max_delta_step': 0, \
               'min_sum_hessian_in_leaf': 0.05, \
               'bagging_fraction': 1, \
               'pos_bagging_fraction': 1, \
               'neg_bagging_fraction': 1, \
               'bagging_freq': 0, \
               'feature_fraction': 1, \
               'feature_fraction_bynode': 1, \
               'min_data_in_leaf': 250, \
               'lambda_l1': 250, \
               ## IO parameters
               'bin_construct_sample_cnt': 200000, \
               'histogram_pool_size': -1, \
               ## objective parameters
               'is_unbalance': True, \
               'metric': 'auc,binary_logloss,binary_error', \
               'metric_freq': 1, \
               'max_bin': 511, \
               'min_data_in_bin': 3, \
               'min_gain_to_split': 0, \
               ### parameters for categorical features
               'min_data_per_group': 100, \
               'max_cat_threshold': 32, \
               'cat_l2': 500, \
               'cat_smoth': 500
              }

cat_params = {'boosting': ['gbdt']}

num_params = {
#               'num_leaves': (2, 1024), \
              ## learning control parameters
              'max_depth': (1, 200), \
#               'min_data_in_leaf': (2, 500), \
#               'min_sum_hessian_in_leaf': (0, 0.1), \
#               'bagging_fraction': (0.1, 1), \
#               'pos_bagging_fraction': (0.1, 1), \
#               'neg_bagging_fraction': (0.1, 1), \
#               'bagging_freq': (0, 100), \
#               'feature_fraction': (0.1, 1), \
#               'feature_fraction_bynode': (0.1, 1), \
#               'lambda_l1': (0, 500), \
              'lambda_l2': (1000, 4000), \
              ## objective parameters
              'sigmoid': (0.1, 500), \
              ### parameters for categorical features
#               'cat_l2': (10, 1000), \
#               'cat_smoth': (10, 1000), \
              'max_cat_to_onehot': (1, 100)
             }

int_params = ['num_leaves', 'max_depth', 'min_data_in_leaf', 'bagging_freq', \
              'min_data_per_group', 'max_cat_threshold', 'max_cat_to_onehot', \
              'max_bin', 'min_data_in_bin', 'max_cat_to_onehot']

In [None]:
# Fresh Bayesian search (no previous logs are loaded) with 500 optimization
# iterations per categorical combination.
# NOTE(review): X_train / y_train / X_eval / y_eval / X_test / y_test are not
# defined in the visible cells - presumably created earlier in the notebook.
lgb_bayes = LightGBM_binary_bayes_opt(
    X_train=X_train, y_train=y_train,
    X_eval=X_eval, y_eval=y_eval,
    X_test=X_test, y_test=y_test,
    base_params=base_params,
    cat_params=cat_params,
    num_params=num_params,
    int_params=int_params,
    load_log=False,
    num_opts=500,
)
lgb_bayes.optimize_lgb()

## Hyperopt optimizer

In [6]:
# Fixed LightGBM parameters included unchanged in every hyperopt trial.
base_params = {
    'task': 'train',
    'objective': 'binary',
    'tree_learner': 'serial',
    'num_threads': 4,
    'device_type': 'cpu',
    'seed': 1213,
    'bagging_seed': 42,
    'feature_fraction_seed': 3,
    'first_metric_only': False,
    'max_delta_step': 0,
    'bin_construct_sample_cnt': 200000,
    'histogram_pool_size': -1,
    'is_unbalance': True,
    # 'auc' and 'binary_error' are both read back by the hyperopt objective
    'metric': 'auc,binary_logloss,binary_error',
    'metric_freq': 1,
}

# Categorical parameters: name -> list of candidates.
cat_params = {'boosting': ['gbdt']}

# Integer parameters: name -> (start, stop, step), expanded with np.arange
# by the tuner, so `stop` itself is excluded.
int_params = {
    'num_leaves': (2, 1024, 8),
    'max_depth': (1, 100, 1),
    'min_data_in_leaf': (2, 500, 4),
    'bagging_freq': (0, 100, 1),
    'min_data_per_group': (100, 500, 10),
    'max_cat_threshold': (16, 256, 2),
    'max_cat_to_onehot': (1, 100, 1),
    'max_bin': (127, 511, 2),
    'min_data_in_bin': (3, 128, 8),
}

# Float parameters: name -> (low, high), sampled uniformly.
float_params = {
    'min_sum_hessian_in_leaf': (0, 0.1),
    'bagging_fraction': (0.1, 1),
    'pos_bagging_fraction': (0.1, 1),
    'neg_bagging_fraction': (0.1, 1),
    'feature_fraction': (0.1, 1),
    'feature_fraction_bynode': (0.1, 1),
    'lambda_l1': (0, 500),
    'lambda_l2': (1000, 4000),
    'sigmoid': (0.1, 500),
    'cat_l2': (10, 1000),
    # LightGBM's parameter is spelled 'cat_smooth'
    'cat_smooth': (10, 1000),
    'min_gain_to_split': (0, 100),
}

In [None]:
# Resume the hyperopt search from './trials.pkl' and run 50 further evaluations.
# The class defined above is `LightGBM_binary_hyperopt`; the previously used
# name `lightgbm_hyperopt_binary` does not exist and raised a NameError.
# With load_trials=True the trials file must already exist (use
# load_trials=False for the very first run).
# NOTE(review): X_train / y_train / X_eval / y_eval / X_test / y_test are not
# defined in the visible cells - presumably created earlier in the notebook.
lgb_hyperopt = LightGBM_binary_hyperopt(X_train, y_train, X_eval, y_eval, X_test, y_test,
                                        base_params, cat_params, int_params, float_params,
                                        num_opts=50, trials_path='./trials.pkl', load_trials=True)
best_params = lgb_hyperopt.optimize_lgb()