In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from skopt import BayesSearchCV
from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Local libs
from fraud_detection.models import regression as reg
from fraud_detection.main import preprocessing as pp

# notebook settings
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)
%matplotlib inline


df = pp().pickle_load(fname='dfPickle_2')
(x_train, y_train) = pp().pickle_load(fname='dfPickle_2_train')
(x_test, y_test) = pp().pickle_load(fname='dfPickle_2_test')

Invoking __init__.py for fraud_detection


In [8]:
x_train.head()

Unnamed: 0,index,customerId,creditLimit,transactionAmount,merchantName,posEntryMode,posConditionCode,cardPresent,expirationDateKeyInMatch,accountOpenMonthNum
45218,45218,1729,0.419384,0.737737,2485,0,0,0,0,-16
384138,384138,2847,0.0,0.246366,550,0,0,1,0,-31
192970,192970,2048,0.659235,0.571621,2475,2,0,0,0,-11
251644,251644,1557,0.419384,0.566447,158,0,0,0,0,-23
728475,728475,3906,0.659235,0.361137,2086,0,0,0,0,-86


In [10]:
%%time
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, 
                            n_folds=5, random_seed=6,
                            n_estimators=10000, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, 
                 bagging_fraction, max_depth, max_bin, 
                 min_data_in_leaf, min_sum_hessian_in_leaf, subsample):
        params = {'application':'binary', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
                
        cv_result = lgb.cv(params, train_data, nfold=n_folds, 
                           seed=random_seed, stratified=True, 
                           verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
     
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.001, 0.2),
                                            'num_leaves': (25, 60),
                                            'feature_fraction': (0.1, 1),
                                            'bagging_fraction': (0.5, 1),
                                           'max_depth': (2, 20),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)}, random_state=200)


    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len( lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])
    
    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'],lgbBO.res[pd.Series(model_auc).idxmax()]['params']

opt_params = bayes_parameter_opt_lgb(x_train, y_train, init_round=5, opt_round=10, n_folds=5, random_seed=6,n_estimators=10000)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 6938, number of negative: 6938
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 13876, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=tru



| [0m 1       [0m | [0m 0.8026  [0m | [0m 0.9738  [0m | [0m 0.3039  [0m | [0m 0.1193  [0m | [0m 49.98   [0m | [0m 15.75   [0m | [0m 20.17   [0m | [0m 35.74   [0m | [0m 56.84   [0m | [0m 0.4615  [0m |
[LightGBM] [Info] Number of positive: 6938, number of negative: 6938
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54
[LightGBM] [Info] Number of data points in the train set: 13876, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not 



| [95m 2       [0m | [95m 0.8032  [0m | [95m 0.9909  [0m | [95m 0.8806  [0m | [95m 0.1972  [0m | [95m 84.63   [0m | [95m 7.466   [0m | [95m 70.77   [0m | [95m 12.12   [0m | [95m 52.5    [0m | [95m 0.258   [0m |
[LightGBM] [Info] Number of positive: 6938, number of negative: 6938
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 13876, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9


[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9
[LightGBM] [Info] Number of positive: 6939, number of negative: 6938
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9
[LightGBM] [Info] Number of positive: 6939, number of negative: 6938
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.50





| [95m 3       [0m | [95m 0.8156  [0m | [95m 0.548   [0m | [95m 0.9491  [0m | [95m 0.1654  [0m | [95m 56.28   [0m | [95m 17.72   [0m | [95m 54.7    [0m | [95m 45.01   [0m | [95m 48.81   [0m | [95m 0.4252  [0m |
[LightGBM] [Info] Number of positive: 6938, number of negative: 6938
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 13876, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points

| [0m 4       [0m | [0m 0.7767  [0m | [0m 0.8202  [0m | [0m 0.6478  [0m | [0m 0.02198 [0m | [0m 87.62   [0m | [0m 15.66   [0m | [0m 60.78   [0m | [0m 32.93   [0m | [0m 25.93   [0m | [0m 0.8056  [0m |
[LightGBM] [Info] Number of positive: 6938, number of negative: 6938
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 41
[LightGBM] [Info] Number of data points in the train set: 13876, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 41
[LightGBM] [Info] Number of data points in the train set: 13877, number of used features: 9
[LightGBM] [Info] Number of positive: 6938, number of negative: 6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not 





| [0m 5       [0m | [0m 0.7656  [0m | [0m 0.9864  [0m | [0m 0.3546  [0m | [0m 0.1302  [0m | [0m 38.59   [0m | [0m 5.378   [0m | [0m 45.14   [0m | [0m 66.6    [0m | [0m 43.11   [0m | [0m 0.8559  [0m |


TypeError: 'float' object is not subscriptable