In [1]:
!pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/85/ee/2688cce5ced0597e12832d1ec4f4383a468f6bddff768eeaa3b5bf4f6500/optuna-1.3.0.tar.gz (163kB)
[K     |████████████████████████████████| 163kB 2.7MB/s 
[?25hCollecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/60/1e/cabc75a189de0fbb2841d0975243e59bde8b7822bacbb95008ac6fe9ad47/alembic-1.4.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 30.2MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/b9/17/57187872842bf9f65815b6969b515528ec7fd754137d2d3f49e3bc016175/cliff-3.1.0-py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 11.2MB/s 
[?25hCollecting cmaes
  Downloading https://files.pythonhosted.org/packages/a6/9f/9fcd62076df5ff38ac4678e819b23f374973214d

In [0]:
# import dependent libraries

'''
#reference links
Hyperopt and Randomsearch with early stopping https://github.com/WillKoehrsen/hyperparameter-optimization 
Optuna tuner https://github.com/optuna/optuna/blob/master/examples/lightgbm_tuner_simple.py
'''

import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV

import csv
from timeit import default_timer as timer
from hyperopt import STATUS_OK, hp, tpe, Trials, fmin
from hyperopt.pyll.stochastic import sample
import random

import optuna.integration.lightgbm as lgbo
import optuna


In [0]:
# starting with storing the data as data frame
df = pd.read_csv("drive/My Drive/Colab Notebooks/train_test_files_sample.csv")
df.drop( columns='Unnamed: 0', inplace =True)

# making a smaller df for quick testing
df_s, _ = train_test_split(df, random_state = 30, train_size = 0.01)

train_X = df_s.drop(columns = '0')
train_y = df_s['0']

In [0]:
# drop last columns

def col_keep(df):
  return df.drop(columns = list(map(str,range(22,29))), inplace = True) # removing 7 last columns


In [0]:
MAX_EVALS = 5
N_FOLDS = 3

# Hyperopt

In [78]:
def objective(params, n_folds = N_FOLDS):
    """Objective function for Gradient Boosting Machine Hyperparameter Optimization"""
    
    # Keep track of evals
    global ITERATION
    
    ITERATION += 1
    
    # Retrieve the subsample if present otherwise set to 1.0
    subsample = params['boosting_type'].get('subsample', 1.0)
    
    # Extract the boosting type
    params['boosting_type'] = params['boosting_type']['boosting_type']
    params['subsample'] = subsample
    
    # Make sure parameters that need to be integers are integers
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_data_in_leaf', 
                           'max_bin', 'bagging_freq']:
        params[parameter_name] = int(params[parameter_name])
    
    start = timer()
    
    # Perform n_folds cross validation
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, nfold = n_folds, 
                        early_stopping_rounds = 100, metrics = 'auc', seed = 50)
    
    run_time = timer() - start
    
    # Extract the best score
    best_score = np.max(cv_results['auc-mean'])
    
    # Loss must be minimized
    loss = 1 - best_score
    
    # Boosting rounds that returned the highest cv score
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)

    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, ITERATION, n_estimators, run_time])
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'estimators': n_estimators, 
            'train_time': run_time, 'status': STATUS_OK}


# Hyperopt Space
space = {              
            'num_leaves': hp.quniform('num_leaves', 16, 196, 4),

            'max_bin' : hp.quniform('max_bin', 254, 254, 1), #if using CPU just set this to 254

            'lambda_l1': hp.loguniform('lambda_l1', 1e-8, 10.0),
            
            'lambda_l2': hp.loguniform("lambda_l2", 1e-8, 10.0),
            
            'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 20, 500, 10),

            'class_weight': hp.choice('class_weight', [None, 'balanced']),
            
            'boosting_type': hp.choice('boosting_type', [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}, 
                                                         {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                                         {'boosting_type': 'goss', 'subsample': 1.0}]),

            'learning_rate' : hp.loguniform('learning_rate', np.log(0.05), np.log(0.25)),

            'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),

            'feature_fraction': hp.uniform('feature_fraction', 0.4, 1.0),
                     
            'bagging_freq': hp.uniform('bagging_freq', 1, 7),
                     
            'verbosity' : 0

        }

# optimization algorithm
tpe_algorithm = tpe.suggest

# Keep track of results
bayes_trials = Trials()

# File to save first results
out_file = 'gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)

# Write the headers to the file
writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])
of_connection.close()

# Global variable
global  ITERATION

ITERATION = 0

train_set = lgb.Dataset(data=train_X, label = train_y)

# Run optimization
best = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = 100, trials = bayes_trials, rstate = np.random.RandomState(50))

  1%|          | 1/100 [00:00<00:54,  1.81it/s, best loss: 0.5]


Early stopping is not available in dart mode




 10%|█         | 10/100 [00:45<02:56,  1.96s/it, best loss: 0.22370190630097253]


Early stopping is not available in dart mode




 14%|█▍        | 14/100 [01:23<06:42,  4.68s/it, best loss: 0.22370190630097253]


Early stopping is not available in dart mode




 19%|█▉        | 19/100 [12:08<1:04:03, 47.45s/it, best loss: 0.21762932573434635]


Early stopping is not available in dart mode




 20%|██        | 20/100 [12:46<51:05, 38.32s/it, best loss: 0.21762932573434635]


AssertionError: ignored

# Random Search

In [38]:
def random_objective(params, iteration, n_folds = N_FOLDS):
    """Random search objective function. Takes in hyperparameters
       and returns a list of results to be saved."""

    start = timer()
    # Subsampling (only applicable with 'goss')
    subsample_dist = list(np.linspace(0.5, 1, 100))

    if params['boosting_type'] == 'goss':
        # Cannot subsample with goss
        params['subsample'] = 1.0
    else:
        # Subsample supported for gdbt and dart
        params['subsample'] = random.sample(subsample_dist, 1)[0]    

    # Perform n_folds cross validation
    cv_results = lgb.cv(params, train_set, num_boost_round = 1000, nfold = n_folds, 
                        early_stopping_rounds = 100, metrics = 'auc', seed = 50)
    end = timer()
    best_score = np.max(cv_results['auc-mean'])
    
    # Loss must be minimized
    loss = 1 - best_score
    
    # Boosting rounds that returned the highest cv score
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)
    
    # Return list of results
    return [loss, params, iteration, n_estimators, end - start]

# Hyperparameter grid
param_grid = {
                'num_leaves': list(range(16, 196, 4)),
              
                'max_bin': [254],

                'lambda_l1': list(np.linspace(0, 1)),

                'lambda_l2': list(np.linspace(0, 1)),

                'min_data_in_leaf' : list(range(20, 500, 10)),

                'class_weight': [None, 'balanced'],

                'boosting_type': ['gbdt', 'goss', 'dart'],
              
                'learning_rate' : list(np.logspace(np.log(0.05), np.log(0.2), base = np.exp(1), num = 1000)),

                'feature_fraction': list(np.linspace(0.4, 1.0)),

                'bagging_freq': list(range(1,7)),

                'verbosity' : [0]
                }

random.seed(50)

# Dataframe to hold cv results
random_results = pd.DataFrame(columns = ['loss', 'params', 'iteration', 'estimators', 'time'],
                       index = list(range(MAX_EVALS)))

# Iterate through the specified number of evaluations
for i in range(15):
    
    # Randomly sample parameters for gbm
    params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
    
    print(params)  
        
    results_list = random_objective(params, i)
    
    # Add results to next row in dataframe
    random_results.loc[i, :] = results_list

{'num_leaves': 140, 'max_bin': 254, 'lambda_l1': 0.4693877551020408, 'lambda_l2': 0.8163265306122448, 'min_data_in_leaf': 170, 'class_weight': 'balanced', 'boosting_type': 'goss', 'learning_rate': 0.05641591746163987, 'feature_fraction': 0.8163265306122449, 'bagging_freq': 3, 'verbosity': 0}
{'num_leaves': 188, 'max_bin': 254, 'lambda_l1': 0.18367346938775508, 'lambda_l2': 0.44897959183673464, 'min_data_in_leaf': 80, 'class_weight': 'balanced', 'boosting_type': 'goss', 'learning_rate': 0.06851322752809227, 'feature_fraction': 0.5469387755102041, 'bagging_freq': 1, 'verbosity': 0}
{'num_leaves': 168, 'max_bin': 254, 'lambda_l1': 0.8163265306122448, 'lambda_l2': 0.1020408163265306, 'min_data_in_leaf': 60, 'class_weight': 'balanced', 'boosting_type': 'dart', 'learning_rate': 0.12460114426637409, 'feature_fraction': 0.7306122448979592, 'bagging_freq': 1, 'verbosity': 0}



Early stopping is not available in dart mode



{'num_leaves': 84, 'max_bin': 254, 'lambda_l1': 0.8979591836734693, 'lambda_l2': 0.12244897959183673, 'min_data_in_leaf': 140, 'class_weight': 'balanced', 'boosting_type': 'goss', 'learning_rate': 0.18098320907845356, 'feature_fraction': 0.9387755102040817, 'bagging_freq': 5, 'verbosity': 0}
{'num_leaves': 48, 'max_bin': 254, 'lambda_l1': 0.7959183673469387, 'lambda_l2': 0.26530612244897955, 'min_data_in_leaf': 360, 'class_weight': None, 'boosting_type': 'goss', 'learning_rate': 0.13171296819007705, 'feature_fraction': 0.8775510204081632, 'bagging_freq': 6, 'verbosity': 0}
{'num_leaves': 176, 'max_bin': 254, 'lambda_l1': 0.5510204081632653, 'lambda_l2': 0.673469387755102, 'min_data_in_leaf': 360, 'class_weight': None, 'boosting_type': 'goss', 'learning_rate': 0.11244107634796272, 'feature_fraction': 0.8040816326530612, 'bagging_freq': 3, 'verbosity': 0}
{'num_leaves': 108, 'max_bin': 254, 'lambda_l1': 0.0, 'lambda_l2': 0.44897959183673464, 'min_data_in_leaf': 20, 'class_weight': 'balan


Early stopping is not available in dart mode



{'num_leaves': 124, 'max_bin': 254, 'lambda_l1': 0.6938775510204082, 'lambda_l2': 0.18367346938775508, 'min_data_in_leaf': 160, 'class_weight': None, 'boosting_type': 'dart', 'learning_rate': 0.11803700655197771, 'feature_fraction': 0.6204081632653061, 'bagging_freq': 6, 'verbosity': 0}



Early stopping is not available in dart mode



{'num_leaves': 96, 'max_bin': 254, 'lambda_l1': 0.18367346938775508, 'lambda_l2': 0.6938775510204082, 'min_data_in_leaf': 320, 'class_weight': 'balanced', 'boosting_type': 'gbdt', 'learning_rate': 0.15731471815103718, 'feature_fraction': 0.48571428571428577, 'bagging_freq': 3, 'verbosity': 0}


# Optuna

In [81]:
def objective(trial):
    
    dtrain = lgbo.Dataset(train_X, label=train_y)

    global ITERATION_O

    ITERATION_O += 1

    param ={              
            'num_leaves': trial.suggest_int('num_leaves', 16, 196, 4),

            'max_bin' : trial.suggest_uniform('max_bin', 254, 254), #if using CPU just set this to 254

            'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
            
            'lambda_l2': trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
            
            'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 20, 500),

            'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
            
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),

            'learning_rate' : trial.suggest_loguniform('learning_rate', 0.05, 0.25),

            'feature_fraction': trial.suggest_uniform("feature_fraction", 0.4, 1.0),
                
            'bagging_freq': trial.suggest_int("bagging_freq", 1, 7),
                     
            'verbosity' : 0

        }
    
    start = timer()
    # Perform n_folds cross validation
    if param['boosting_type'] == 'goss':
      param['subsample'] = 1
    else:
      param['subsample'] = trial.suggest_uniform('subsample', 0.5, 1)
    
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, nfold = 3, 
                        early_stopping_rounds = 100, metrics = 'auc', seed = 50)
    
    run_time = timer() - start
    
    # Extract the best score
    best_score = np.max(cv_results['auc-mean'])

    loss = 1 - best_score

    # Boosting rounds that returned the highest cv score
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)

    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, param, ITERATION, n_estimators, run_time])

    
    return loss

# Global variable
global  ITERATION_O

ITERATION_O = 0

# File to save first results
out_file = 'gbm_optuna.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)

# Write the headers to the file
writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])
of_connection.close()

study = optuna.create_study(direction='minimize')

study.optimize(objective, n_trials=5000)

[32m[I 2020-04-02 17:57:25,374][0m Finished trial#0 with value: 0.22092658226241424 with parameters: {'num_leaves': 60, 'max_bin': 254, 'lambda_l1': 2.4569464506213583e-06, 'lambda_l2': 0.00180884407558789, 'min_data_in_leaf': 91, 'class_weight': None, 'boosting_type': 'gbdt', 'learning_rate': 0.19258450886769526, 'feature_fraction': 0.8148468683250664, 'bagging_freq': 7, 'subsample': 0.6048199922164568}. Best is trial#0 with value: 0.22092658226241424.[0m
[32m[I 2020-04-02 17:57:26,966][0m Finished trial#1 with value: 0.22092658226241424 with parameters: {'num_leaves': 156, 'max_bin': 254, 'lambda_l1': 2.0978256504871975e-07, 'lambda_l2': 2.218610749344635e-08, 'min_data_in_leaf': 331, 'class_weight': 'balanced', 'boosting_type': 'gbdt', 'learning_rate': 0.1422788757978464, 'feature_fraction': 0.8053325542084108, 'bagging_freq': 7, 'subsample': 0.6441387756094226}. Best is trial#0 with value: 0.22092658226241424.[0m
[32m[I 2020-04-02 17:57:28,575][0m Finished trial#2 with valu

KeyboardInterrupt: ignored

# LGBM Class

In [0]:
class LightGBM_Class():
  def __init__(self, df, target):
    self.df = df
    self.target = target
    self.train_set = lgb.Dataset(data=self.df, label = self.target)

  
  def lgbm_classifier(self, num_round = 10, nfold = 5):
    self.param = {'num_leaves': 31, 'objective': 'binary'}
    self.param['metric'] = 'auc'
    self.num_round =  num_round
    self.n_fold = nfold
    bst = lgb.LGBMClassifier()
    bst.fit(self.df, self.target)
    return bst
 



In [0]:
test = LightGBM_Class(train_X, train_y)

In [0]:
model = test.lgbm_classifier()

In [0]:
model.save_model('model.txt', num_iteration = model.best_iteration)

In [105]:
model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [0]:
ml = lgb.(params, train_set)



In [109]:
params

{'bagging_freq': 3,
 'boosting_type': 'gbdt',
 'class_weight': 'balanced',
 'feature_fraction': 0.48571428571428577,
 'lambda_l1': 0.18367346938775508,
 'lambda_l2': 0.6938775510204082,
 'learning_rate': 0.15731471815103718,
 'max_bin': 254,
 'min_data_in_leaf': 320,
 'num_leaves': 96,
 'subsample': 0.9595959595959596,
 'verbosity': 0}