In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from bayes_opt import BayesianOptimization
import lightgbm as lgb

import os

from sklearn.cluster import KMeans

from sklearn.neighbors import DistanceMetric

from xgboost import XGBClassifier

In [2]:
# control parameters
nfolds = 5

# Data

In [3]:
xtrain = pd.read_csv("../input/train.csv")
xtest = pd.read_csv("../input/test.csv")

In [4]:
# separate the data
id_train = xtrain['ID_code']
ytrain = xtrain['target']
id_test = xtest['ID_code']
xtrain.drop(['ID_code', 'target'], axis = 1, inplace = True)
xtest.drop('ID_code', axis = 1, inplace = True)

In [5]:
folds = KFold(n_splits= nfolds, shuffle=True, random_state= 15)

mindex  = np.zeros((len(xtrain),1) )

for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain.values, ytrain.values)):
#    print('----')
    print("fold n°{}".format(fold_))
    
    mindex[val_idx, 0] = fold_

xfolds = pd.DataFrame()
xfolds['MachineIdentifier'] = id_train
xfolds['fold_id']= mindex
xfolds['fold_id'] = xfolds['fold_id'].astype(int).round(0)
xfolds.to_csv('xfolds.csv', index = False)

fold n°0
fold n°1
fold n°2
fold n°3
fold n°4


# FE

## Clustering

In [6]:
kmeans = KMeans(init='k-means++', n_clusters= 10, n_init=10)
kmeans.fit(xtrain)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [7]:
# distance of each observation from cluster centers
dist = DistanceMetric.get_metric('euclidean')
ax_tr = dist.pairwise(xtrain, kmeans.cluster_centers_)
ax_te = dist.pairwise(xtest, kmeans.cluster_centers_)

In [8]:
# format into dataframe
ax_tr = pd.DataFrame(ax_tr)
ax_te = pd.DataFrame(ax_te)
xcols =  ['dist' + str(f) for f in range(0, ax_tr.shape[1])]

In [9]:
ax_tr.columns = xcols
ax_te.columns = xcols

## Summary statistics

In [10]:
m1 = xtrain.max(axis = 1)
m2 = xtrain.min(axis = 1)
m3 = xtrain.median(axis = 1)
m4 = 1/xtrain.std(axis = 1)
m5 = 1/xtrain.mad(axis = 1)

xtrain['xmax'] = m1; xtrain['xmin'] = m2; xtrain['xmed'] = m3; xtrain['xstd'] = m4

m1 = xtest.max(axis = 1)
m2 = xtest.min(axis = 1)
m3 = xtest.median(axis = 1)
m4 = 1/xtest.std(axis = 1)
m5 = 1/xtest.mad(axis = 1)

xtest['xmax'] = m1; xtest['xmin'] = m2; xtest['xmed'] = m3; xtest['xstd'] = m4


In [11]:
# combine
xtrain = pd.concat([xtrain, ax_tr], axis = 1)
xtest = pd.concat([xtest, ax_te], axis = 1)

In [12]:
xtrain.head(3)

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,...,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,xmax,xmin,xmed,xstd,dist0,dist1,dist2,dist3,dist4,dist5,dist6,dist7,dist8,dist9
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,2.9252,3.1821,14.0137,0.5745,8.7989,14.5691,5.7487,-7.2393,4.284,30.7133,10.535,16.2191,2.5791,2.4716,14.3831,13.4325,-5.1488,-0.4073,4.9306,5.9965,-0.3085,12.9041,-3.8766,16.8911,11.192,10.5785,0.6764,7.8871,4.6667,3.8743,...,18.5618,7.7423,-10.1245,13.7241,-3.5189,1.7202,-8.4051,9.0164,3.0657,14.3691,25.8398,5.8764,11.8411,-19.7159,17.5743,0.5857,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914,43.1127,-21.4494,6.7704,0.107163,75.275861,82.578509,85.218292,80.714147,82.625528,74.696624,74.624408,74.500807,79.43526,77.726521
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,-0.4032,8.0585,14.0239,8.4135,5.4345,13.7003,13.8275,-15.5849,7.8,28.5708,3.4287,2.7407,8.5524,3.3716,6.9779,13.891,-11.7684,-2.5586,5.0464,0.5481,-9.2987,7.8755,1.2859,19.371,11.3702,0.7399,2.7995,5.8434,10.816,3.6783,...,30.2645,10.4968,-7.2352,16.5721,-7.3477,11.0752,-5.5937,9.4878,-14.91,9.4245,22.5441,-4.8622,7.6543,-15.9319,13.3175,-0.3566,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518,40.5632,-47.3797,7.22315,0.096748,107.150707,101.684331,84.309732,82.807817,83.028558,89.490935,93.695367,94.982223,95.812135,90.644216
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,-0.3249,-11.2648,14.1929,7.3124,7.5244,14.6472,7.6782,-1.7395,4.7011,20.4775,17.7559,18.1377,1.2145,3.5137,5.6777,13.2177,-7.994,-2.9029,5.8463,6.1439,-11.1025,12.4858,-2.2871,19.0422,11.0449,4.1087,4.6974,6.9346,10.8917,0.9003,...,25.682,10.9202,-0.3104,8.8438,-9.7009,2.4013,-4.2935,9.3908,-13.2648,3.1545,23.0866,-5.3,5.3745,-6.266,10.1934,-0.8417,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965,33.882,-22.4038,5.8994,0.114241,72.238873,73.439649,75.634427,77.209975,74.879101,73.866614,68.79239,73.877706,71.461238,68.719158


# Parameter tuning

In [13]:
def lgbcv(learning_rate, subsample, min_child_samples, max_depth,
                  colsample_bytree, min_child_weight, min_split_gain, 
                  lambda_l1, lambda_l2,bagging_freq, num_leaves,
                  silent=True, seed=1234):

    params = {                        
            'boosting_type': 'gbdt','objective': 'binary', 'metric':'auc',
            'max_depth': -1, 'num_leaves': int(num_leaves),
            'learning_rate': learning_rate, 'max_depth': int(max_depth),
            'min_child_samples': int(min_child_samples), 
           'subsample': subsample, 'colsample_bytree': colsample_bytree, 'bagging_seed': 11,
           'min_child_weight': min_child_weight,  'bagging_freq' : int(bagging_freq),
           'min_split_gain': min_split_gain,'lambda_l1': lambda_l1,'lambda_l2': lambda_l2,
           'nthread': 8
        }

                
    bst1 = lgb.train(params, trn_data, valid_sets=[trn_data, val_data], valid_names=['train','valid'],
                          num_boost_round= 5000, verbose_eval= 5000, early_stopping_rounds = 100)
    
    ypred = bst1.predict(x1)

    loss = roc_auc_score(y1, ypred)
    return loss

In [14]:
## find optimal params
param_list = list()
score_list = list()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(xtrain.values, ytrain.values)):
    print('----')
    print("fold n°{}".format(fold_))
    
    x0,y0 = xtrain.iloc[trn_idx], ytrain[trn_idx]
    x1,y1 = xtrain.iloc[val_idx], ytrain[val_idx]
    
    
    trn_data = lgb.Dataset(x0, label= y0); val_data = lgb.Dataset(x1, label= y1)
    # optimization
    lgbBO = BayesianOptimization(lgbcv, {'learning_rate': (0.0025, 0.05),'max_depth': (int(5), int(15)),
                                           'min_child_samples': (int(25), int(250)),'subsample': (0.2, 0.95),
                                            'colsample_bytree': (0.2, 0.95), 'min_child_weight': (int(1), int(150)),
                                            'min_split_gain': (0.1, 2),'num_leaves': (int(15),int(200)),
                                             'lambda_l1': (10, 200),'lambda_l2': (10, 200),
                                             'bagging_freq': (1,20)
                                                   })
    lgbBO.maximize(init_points= 25, n_iter= 45, xi=0.06)
    print('-' * 53)
    print('Final Results')
    print('LGB: %f' % lgbBO.res['max']['max_val']);  print('LGB: %s' % lgbBO.res['max']['max_params'])

    param_list.append(lgbBO.res['max']['max_params'])
    score_list.append(lgbBO.res['max']['max_val'])

----
fold n°0
|   iter    |  target   | baggin... | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_ch... | min_sp... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[2022]	train's auc: 0.877751	valid's auc: 0.857724
| [0m 1       [0m | [0m 0.8577  [0m | [0m 8.75    [0m | [0m 0.9034  [0m | [0m 157.6   [0m | [0m 170.7   [0m | [0m 0.01819 [0m | [0m 9.137   [0m | [0m 197.0   [0m | [0m 147.5   [0m | [0m 1.005   [0m | [0m 41.88   [0m | [0m 0.3509  [0m |
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[828]	train's auc: 0.873127	valid's auc: 0.858794
| [95m 2       [0m | [95m 0.8588  [0m | [95m 10.88   [0m | [95m 0.4606  [0m | [95m 100.0   [0m | [95

TypeError: list indices must be integers or slices, not str