# Read-me

- For this project, we wanto to build a ensemble method from a combination of LightGBMClassifier, Random Forests and a Logistic Regressor. 
- To do this, firstly, we need to optimize our parameters. 
- We use bayesian optimization as it showed better than grid search (in relation to time).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.model_selection import train_test_split
from lib.utils import *

In [5]:
!pip install lightgbm
!pip install scikit-optimize



# Load data

In [3]:
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [20]:
X_train = np.load('data/input/X_train.npy')
y_train = np.load('data/input/y_train.npy')
X_test = np.load('data/input/X_test.npy')
y_test = np.load('data/input/y_test.npy')

# Bayesian Optimization

## LGBM

In [22]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.10, random_state=42)

In [23]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [43]:
from skopt import forest_minimize
from lightgbm import LGBMClassifier

def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    
    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                         min_child_samples=min_child_samples, subsample=subsample,
                         colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                         class_weight="balanced", n_jobs=6)
    mdl.fit(X_train, y_train)
    
    p = mdl.predict_proba(X_valid)[:, 1]
    
    print(roc_auc_score(y_valid, p))
    
    return -average_precision_score(y_valid, p)


space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 10), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          ] 

res = forest_minimize(tune_lgbm, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272]
0.9669884624440781
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.2610
Function value obtained: -0.8507
Current minimum: -0.8507
Iteration No: 2 started. Evaluating function at random point.
[0.0010385556240017917, 2, 10, 0.14183771058242609, 0.7437489153990157, 249]
0.9467859665646338
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0823
Function value obtained: -0.7306
Current minimum: -0.8507
Iteration No: 3 started. Evaluating function at random point.
[0.00209745522423282, 5, 6, 0.1541824778996655, 0.8682075103820793, 273]
0.9648928655521545
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1297
Function value obtained: -0.8292
Current minimum: -0.8507
Iteration No: 4 started. Evaluating function at random point.
[0.016490254525097375, 9, 9, 0.6502182010234373, 0.6866210554187129, 82

In [44]:
params = res.x

In [46]:
import pickle

pickle.dump(params,open('data/params/lgbm.pickle','wb'))

## RandomForestClassifier

In [47]:
from sklearn.ensemble import RandomForestClassifier
def tune_rf(params):
    print(params)
    
    bootstrap = params[0]
    max_depth = params[1]
    max_features = params[2]
    min_samples_leaf = params[3]
    min_samples_split = params[4]
    n_estimators = params[5]

    
    
    mdl = RandomForestClassifier(bootstrap=bootstrap, max_depth=max_depth, \
     max_features=max_features, min_samples_leaf=min_samples_leaf, \
        min_samples_split=min_samples_split, n_estimators=n_estimators,    \
                                 random_state=0, 
                         class_weight="balanced", n_jobs=-1)
    mdl.fit(X_train, y_train)
    
    p = mdl.predict_proba(X_valid)[:, 1]
    
    print(roc_auc_score(y_valid, p))
    
    return -average_precision_score(y_valid, p)


space = {'bootstrap': [True, False],
 'max_depth': [10, 20, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 4],
 'min_samples_split': [2, 10],
 'n_estimators': [200, 1000]}

space = list(space.values())
print(space)

res = forest_minimize(tune_rf, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

[[True, False], [10, 20, None], ['auto', 'sqrt'], [1, 4], [2, 10], [200, 1000]]
Iteration No: 1 started. Evaluating function at random point.
[True, 20, 'sqrt', 3, 9, 208]
0.9678832116788321
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.4848
Function value obtained: -0.8488
Current minimum: -0.8488
Iteration No: 2 started. Evaluating function at random point.
[True, 20, 'auto', 3, 7, 700]
0.9683659053449494
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.5672
Function value obtained: -0.8498
Current minimum: -0.8498
Iteration No: 3 started. Evaluating function at random point.
[False, None, 'sqrt', 4, 5, 883]
0.9661408052743112
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 2.2167
Function value obtained: -0.8358
Current minimum: -0.8498
Iteration No: 4 started. Evaluating function at random point.
[True, 20, 'sqrt', 3, 10, 592]
0.968283494231222
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.2496

In [48]:
params = res.x

In [49]:
import pickle

pickle.dump(params,open('data/params/rf.pickle','wb'))

## Logistic Regressor

In [50]:
from sklearn.linear_model import LogisticRegression
def tune_lr(params):
    print(params)
    
    C = params[0]
    solver = params[1]

    mdl = LogisticRegression(C=C, solver=solver,\
                         class_weight="balanced", n_jobs=-1)
    mdl.fit(X_train, y_train)
    
    p = mdl.predict_proba(X_valid)[:, 1]
    
    print(roc_auc_score(y_valid, p))
    
    return -average_precision_score(y_valid, p)


space = {'C': [1, 10],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
        }

space = list(space.values())
print(space)

res = forest_minimize(tune_lr, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

[[1, 10], ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']]
Iteration No: 1 started. Evaluating function at random point.
[5, 'lbfgs']
0.9375441488109254
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.7396
Function value obtained: -0.6452
Current minimum: -0.6452
Iteration No: 2 started. Evaluating function at random point.
[1, 'liblinear']
0.9368259948198729
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0517
Function value obtained: -0.6437
Current minimum: -0.6452
Iteration No: 3 started. Evaluating function at random point.
[8, 'sag']




0.9315045914763362
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1773
Function value obtained: -0.6171
Current minimum: -0.6452
Iteration No: 4 started. Evaluating function at random point.
[9, 'saga']
0.9284906993171651
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.1655
Function value obtained: -0.6018
Current minimum: -0.6452
Iteration No: 5 started. Evaluating function at random point.
[1, 'sag']




0.9314574994113491
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1535
Function value obtained: -0.6164
Current minimum: -0.6452
Iteration No: 6 started. Evaluating function at random point.
[6, 'newton-cg']
0.9376030138921592
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.4181
Function value obtained: -0.6454
Current minimum: -0.6454
Iteration No: 7 started. Evaluating function at random point.
[5, 'saga']
0.9285260183659053
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.1957
Function value obtained: -0.6020
Current minimum: -0.6454
Iteration No: 8 started. Evaluating function at random point.
[4, 'saga']




0.9284906993171651
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.1786
Function value obtained: -0.6018
Current minimum: -0.6454
Iteration No: 9 started. Evaluating function at random point.
[9, 'lbfgs']
0.9376736519896398
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.4503
Function value obtained: -0.6457
Current minimum: -0.6457
Iteration No: 10 started. Evaluating function at random point.
[9, 'sag']
0.931516364492583
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.1655
Function value obtained: -0.6175
Current minimum: -0.6457
Iteration No: 11 started. Evaluating function at random point.
[9, 'sag']




0.9315399105250766
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.1540
Function value obtained: -0.6170
Current minimum: -0.6457
Iteration No: 12 started. Evaluating function at random point.
[10, 'sag']
0.931516364492583
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.1857
Function value obtained: -0.6170
Current minimum: -0.6457
Iteration No: 13 started. Evaluating function at random point.
[10, 'saga']




0.9284906993171651
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.2148
Function value obtained: -0.6018
Current minimum: -0.6457
Iteration No: 14 started. Evaluating function at random point.
[8, 'lbfgs']
0.9376736519896397
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.4417
Function value obtained: -0.6456
Current minimum: -0.6457
Iteration No: 15 started. Evaluating function at random point.
[6, 'newton-cg']
0.9376030138921592
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.4314
Function value obtained: -0.6454
Current minimum: -0.6457
Iteration No: 16 started. Evaluating function at random point.
[1, 'lbfgs']
0.9369084059336001
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.4476
Function value obtained: -0.6437
Current minimum: -0.6457
Iteration No: 17 started. Evaluating function at random point.
[10, 'newton-cg']
0.93770897103838
Iteration No: 17 ended. Evaluation done at random point.
T



Iteration No: 20 ended. Evaluation done at random point.
Time taken: 0.2348
Function value obtained: -0.6438
Current minimum: -0.6459
Iteration No: 21 started. Searching for the next optimal point.
[3, 'newton-cg']
0.9373322345184836
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.3593
Function value obtained: -0.6439
Current minimum: -0.6459
Iteration No: 22 started. Searching for the next optimal point.
[4, 'lbfgs']
0.9375088297621851
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.2642
Function value obtained: -0.6448
Current minimum: -0.6459
Iteration No: 23 started. Searching for the next optimal point.
[3, 'newton-cg']
0.9373322345184836




Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.2664
Function value obtained: -0.6439
Current minimum: -0.6459
Iteration No: 24 started. Searching for the next optimal point.
[5, 'liblinear']
0.9374852837296915




Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.2399
Function value obtained: -0.6450
Current minimum: -0.6459
Iteration No: 25 started. Searching for the next optimal point.
[9, 'liblinear']
0.9376736519896398




Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.2306
Function value obtained: -0.6456
Current minimum: -0.6459
Iteration No: 26 started. Searching for the next optimal point.
[7, 'liblinear']
0.9375794678596657




Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.2329
Function value obtained: -0.6452
Current minimum: -0.6459
Iteration No: 27 started. Searching for the next optimal point.
[10, 'liblinear']
0.9376854250058865




Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.2331
Function value obtained: -0.6457
Current minimum: -0.6459
Iteration No: 28 started. Searching for the next optimal point.
[4, 'newton-cg']
0.9375206027784319
Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.2647
Function value obtained: -0.6449
Current minimum: -0.6459
Iteration No: 29 started. Searching for the next optimal point.
[2, 'lbfgs']
0.9372027313397693
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.2623
Function value obtained: -0.6439
Current minimum: -0.6459
Iteration No: 30 started. Searching for the next optimal point.
[4, 'liblinear']
0.9374028726159642




Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.2325
Function value obtained: -0.6443
Current minimum: -0.6459
Iteration No: 31 started. Searching for the next optimal point.
[6, 'liblinear']
0.9375323757946786




Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.2318
Function value obtained: -0.6452
Current minimum: -0.6459
Iteration No: 32 started. Searching for the next optimal point.
[4, 'liblinear']
0.9374028726159642




Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.3207
Function value obtained: -0.6443
Current minimum: -0.6459
Iteration No: 33 started. Searching for the next optimal point.
[8, 'liblinear']
0.9376501059571463




Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.2502
Function value obtained: -0.6454
Current minimum: -0.6459
Iteration No: 34 started. Searching for the next optimal point.
[2, 'newton-cg']
0.937214504356016
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.2597
Function value obtained: -0.6439
Current minimum: -0.6459
Iteration No: 35 started. Searching for the next optimal point.
[3, 'lbfgs']
0.9373204615022369
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.2813
Function value obtained: -0.6439
Current minimum: -0.6459
Iteration No: 36 started. Searching for the next optimal point.
[4, 'newton-cg']
0.9375206027784319




Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.2732
Function value obtained: -0.6449
Current minimum: -0.6459
Iteration No: 37 started. Searching for the next optimal point.
[1, 'newton-cg']
0.9369084059336001
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.2790
Function value obtained: -0.6437
Current minimum: -0.6459
Iteration No: 38 started. Searching for the next optimal point.
[4, 'newton-cg']
0.9375206027784319




Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.2739
Function value obtained: -0.6449
Current minimum: -0.6459
Iteration No: 39 started. Searching for the next optimal point.
[5, 'liblinear']
0.9374852837296915




Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.2504
Function value obtained: -0.6450
Current minimum: -0.6459
Iteration No: 40 started. Searching for the next optimal point.
[9, 'newton-cg']
0.9376971980221334
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.2689
Function value obtained: -0.6458
Current minimum: -0.6459
Iteration No: 41 started. Searching for the next optimal point.
[7, 'lbfgs']
0.9376383329408994
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.2953
Function value obtained: -0.6455
Current minimum: -0.6459
Iteration No: 42 started. Searching for the next optimal point.
[4, 'lbfgs']
0.9375088297621851




Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.3650
Function value obtained: -0.6448
Current minimum: -0.6459
Iteration No: 43 started. Searching for the next optimal point.
[3, 'lbfgs']
0.9373204615022369




Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.2690
Function value obtained: -0.6439
Current minimum: -0.6459
Iteration No: 44 started. Searching for the next optimal point.
[4, 'lbfgs']
0.9375088297621851




Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.2906
Function value obtained: -0.6448
Current minimum: -0.6459
Iteration No: 45 started. Searching for the next optimal point.
[4, 'liblinear']
0.9374028726159642




Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.2398
Function value obtained: -0.6443
Current minimum: -0.6459
Iteration No: 46 started. Searching for the next optimal point.
[2, 'newton-cg']
0.937214504356016




Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 0.2829
Function value obtained: -0.6439
Current minimum: -0.6459
Iteration No: 47 started. Searching for the next optimal point.
[8, 'newton-cg']
0.9376736519896397
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.3021
Function value obtained: -0.6456
Current minimum: -0.6459
Iteration No: 48 started. Searching for the next optimal point.
[7, 'newton-cg']
0.9376618789733929
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 0.2826
Function value obtained: -0.6455
Current minimum: -0.6459
Iteration No: 49 started. Searching for the next optimal point.
[10, 'newton-cg']
0.93770897103838




Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.2922
Function value obtained: -0.6459
Current minimum: -0.6459
Iteration No: 50 started. Searching for the next optimal point.
[7, 'sag']
0.931516364492583




Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.4119
Function value obtained: -0.6175
Current minimum: -0.6459


In [51]:
params = res.x

import pickle

pickle.dump(params,open('data/params/lr.pickle','wb'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
