In [16]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold

In [17]:
df=pd.read_csv("train.csv")
df_test=pd.read_csv("test.csv")
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [18]:
df.bin_1.value_counts(),df.nom_0.value_counts()

(0.0    474018
 1.0    107979
 Name: bin_1, dtype: int64, Red      323286
 Blue     205861
 Green     52601
 Name: nom_0, dtype: int64)

In [19]:
df.target.value_counts()

0    487677
1    112323
Name: target, dtype: int64

In [20]:
class categoricalfeature:
    def __init__(self,df,cat_feat,encoding_type,handle_NAN=False):
        #df pandas frames
        #cat_feature: list of categorical feature
        #encoding_type: lable,binary,one_hot
        self.df=df
        self.output_df=self.df.copy(deep=True)
        self.cat_feat=cat_feat
        self.encoding_type=encoding_type
        self.label_encoders=dict()
        
        if handle_NAN:
            for c in self.cat_feat:
                self.df.loc[:,c]= self.df.loc[:,c].astype(str).fillna('-99999')

    def _label_endcoding(self):
        for c in self.cat_feat:
            lbl=preprocessing.LabelEncoder()
            lbl.fit(self.df[c].values)
            self.output_df.loc[:,c]=lbl.transform(self.df[c].values)
            self.label_encoders[c]=lbl
        return self.output_df
            
    def transform(self):
        if self.encoding_type=='label':
            return self._label_endcoding()
        else:
            raise Exception("Encoding type not understand")

In [21]:
cols=[c for c in df.columns if c not in ['id','target']]
cat=categoricalfeature(df,cat_feat=cols,encoding_type='label',handle_NAN=True)
catt=categoricalfeature(df_test,cat_feat=cols,encoding_type='label',handle_NAN=True)
encoded_df=cat.transform()
encoded_df_test=catt.transform()

In [22]:
encoded_df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,0,0,2,4,3,5,...,27,2,0,3,2,20,56,5,5,0
1,1,1,1,0,0,1,2,3,0,6,...,2112,2,2,5,4,23,151,6,9,0
2,2,0,1,0,0,0,2,6,3,0,...,2218,2,5,2,13,15,105,4,11,0
3,3,2,0,0,0,0,2,0,3,3,...,2167,0,4,4,0,2,140,2,5,0
4,4,0,2,0,1,0,2,5,3,2,...,1747,2,2,1,7,2,50,4,3,0


In [23]:
encoded_df_test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0,0,0,0,1,0,1,0,3,...,174,2194,2,4,0,5,20,147,2,11
1,600001,0,0,0,0,1,2,0,4,5,...,4,1105,0,4,1,13,13,140,1,10
2,600002,0,0,0,0,1,0,0,0,5,...,16,810,0,1,5,8,13,12,1,8
3,600003,1,0,0,0,0,2,1,0,2,...,107,994,0,1,3,12,1,0,0,8
4,600004,0,0,1,0,1,2,0,6,3,...,30,370,0,0,4,15,9,14,2,5


In [24]:
encoded_df.columns

Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month',
       'target'],
      dtype='object')

In [25]:
X=encoded_df.drop(["id","target"],axis=1)
Y=encoded_df["target"]

In [26]:
skf = StratifiedKFold(n_splits=10,shuffle=True)
skf.get_n_splits(X, Y)

for train_index, test_index in skf.split(X, Y):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

TRAIN: [     2      3      4 ... 599997 599998 599999] TEST: [     0      1     43 ... 599977 599979 599992]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [    22     24     35 ... 599961 599966 599995]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [     5     25     34 ... 599969 599983 599986]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [     4     33     44 ... 599978 599989 599990]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [    11     21     38 ... 599953 599958 599968]
TRAIN: [     0      1      2 ... 599996 599998 599999] TEST: [    27     39     45 ... 599982 599984 599997]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [     7      8      9 ... 599957 599964 599980]
TRAIN: [     0      1      2 ... 599996 599997 599999] TEST: [    13     18     32 ... 599987 599994 599998]
TRAIN: [     0      1      3 ... 599997 599998 599999] TEST: [     2     14     17 ... 599988 599993 599996]
TRAIN: [     0     

In [16]:
'''

from sklearn.metrics import confusion_matrix,accuracy_score
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(objective='binary', random_state=5)
skf = StratifiedKFold(n_splits=10,shuffle=True)
skf.get_n_splits(X, Y)
accuracy=[]

for train_index, test_index in skf.split(X, Y):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    #lgbm.fit(x_train,y_train)
    #pred=lgbm.predict(x_test)
    #zscore=accuracy_score(pred,y_test)
    #accuracy.append(score)
#print(accuracy)'''

TRAIN: [     0      1      2 ... 599996 599997 599998] TEST: [     9     11     43 ... 599986 599994 599999]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [     8     17     23 ... 599982 599984 599987]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [    13     21     35 ... 599980 599983 599992]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [    14     26     33 ... 599962 599965 599975]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [     3      4     16 ... 599946 599970 599979]
TRAIN: [     0      1      2 ... 599997 599998 599999] TEST: [     5     10     19 ... 599957 599963 599977]
TRAIN: [     0      1      2 ... 599994 599996 599999] TEST: [     7     12     15 ... 599995 599997 599998]
TRAIN: [     1      2      3 ... 599997 599998 599999] TEST: [     0      6     22 ... 599989 599990 599996]
TRAIN: [     0      2      3 ... 599997 599998 599999] TEST: [     1     29     39 ... 599947 599991 599993]
TRAIN: [     0     

In [None]:
'''import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

params = {
    'learning_rate': [0.003,.002,.001, 0.004,0.01],
    'num_leaves': [5,10,20],
    'reg_alpha': [0.1, 0.5],
    'min_data_in_leaf': [10,30, 50, 100, 300, 400],
    'lambda_l1': [0, 1, 1.5],
    'lambda_l2': [0, 1],
    'boosting_type':['gbdt'],
    'objective':['binary'],
    'metric':['binary_logloss']
    }

lgb_estimator = lgb.Dataset(X,label=Y)

gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=params)
lgb_model = gsearch.fit(X=X, y=Y)
'''

In [55]:
import lightgbm as lgb
params={}
params['learning_rate']= 0.162
params['boosting_type']='gbdt'
params['objective']='binary'
params['metric']='binary_logloss'
params['sub_feature']=0.5
params['num_leaves']= 10
params['min_data']=50
params['max_depth']=30
d_train = lgb.Dataset(x_train, label=y_train )
d_test =  lgb.Dataset( x_test, label= y_test)
model=lgb.train(params,d_train,500,
                valid_sets=d_test,verbose_eval=True,
                early_stopping_rounds=20)

[1]	valid_0's binary_logloss: 0.474955
Training until validation scores don't improve for 20 rounds
[2]	valid_0's binary_logloss: 0.469664
[3]	valid_0's binary_logloss: 0.465627
[4]	valid_0's binary_logloss: 0.46169
[5]	valid_0's binary_logloss: 0.458961
[6]	valid_0's binary_logloss: 0.456486
[7]	valid_0's binary_logloss: 0.453789
[8]	valid_0's binary_logloss: 0.452155
[9]	valid_0's binary_logloss: 0.450825
[10]	valid_0's binary_logloss: 0.449555
[11]	valid_0's binary_logloss: 0.448318
[12]	valid_0's binary_logloss: 0.446684
[13]	valid_0's binary_logloss: 0.444777
[14]	valid_0's binary_logloss: 0.443619
[15]	valid_0's binary_logloss: 0.442892
[16]	valid_0's binary_logloss: 0.442007
[17]	valid_0's binary_logloss: 0.440413
[18]	valid_0's binary_logloss: 0.439181
[19]	valid_0's binary_logloss: 0.438149
[20]	valid_0's binary_logloss: 0.437485
[21]	valid_0's binary_logloss: 0.436801
[22]	valid_0's binary_logloss: 0.435763
[23]	valid_0's binary_logloss: 0.434893
[24]	valid_0's binary_logloss

[204]	valid_0's binary_logloss: 0.409376
[205]	valid_0's binary_logloss: 0.409373
[206]	valid_0's binary_logloss: 0.409349
[207]	valid_0's binary_logloss: 0.409337
[208]	valid_0's binary_logloss: 0.409339
[209]	valid_0's binary_logloss: 0.409338
[210]	valid_0's binary_logloss: 0.409331
[211]	valid_0's binary_logloss: 0.409294
[212]	valid_0's binary_logloss: 0.409261
[213]	valid_0's binary_logloss: 0.409257
[214]	valid_0's binary_logloss: 0.409253
[215]	valid_0's binary_logloss: 0.409258
[216]	valid_0's binary_logloss: 0.409109
[217]	valid_0's binary_logloss: 0.409012
[218]	valid_0's binary_logloss: 0.408923
[219]	valid_0's binary_logloss: 0.408909
[220]	valid_0's binary_logloss: 0.408826
[221]	valid_0's binary_logloss: 0.408782
[222]	valid_0's binary_logloss: 0.408769
[223]	valid_0's binary_logloss: 0.408683
[224]	valid_0's binary_logloss: 0.408698
[225]	valid_0's binary_logloss: 0.408686
[226]	valid_0's binary_logloss: 0.40868
[227]	valid_0's binary_logloss: 0.408641
[228]	valid_0's b

[406]	valid_0's binary_logloss: 0.405743
[407]	valid_0's binary_logloss: 0.405738
[408]	valid_0's binary_logloss: 0.405748
[409]	valid_0's binary_logloss: 0.405731
[410]	valid_0's binary_logloss: 0.405735
[411]	valid_0's binary_logloss: 0.405698
[412]	valid_0's binary_logloss: 0.405688
[413]	valid_0's binary_logloss: 0.4057
[414]	valid_0's binary_logloss: 0.405699
[415]	valid_0's binary_logloss: 0.405663
[416]	valid_0's binary_logloss: 0.405674
[417]	valid_0's binary_logloss: 0.405672
[418]	valid_0's binary_logloss: 0.405671
[419]	valid_0's binary_logloss: 0.405639
[420]	valid_0's binary_logloss: 0.405576
[421]	valid_0's binary_logloss: 0.405581
[422]	valid_0's binary_logloss: 0.405561
[423]	valid_0's binary_logloss: 0.405556
[424]	valid_0's binary_logloss: 0.405544
[425]	valid_0's binary_logloss: 0.405531
[426]	valid_0's binary_logloss: 0.405526
[427]	valid_0's binary_logloss: 0.405523
[428]	valid_0's binary_logloss: 0.405522
[429]	valid_0's binary_logloss: 0.405519
[430]	valid_0's bi

In [59]:
params={}
params['learning_rate']= 0.15
params['boosting_type']='gbdt'
params['objective']='binary'
params['metric']='binary_logloss'
params['sub_feature']=0.5
params['num_leaves']= 10
params['min_data']=50
params['max_depth']=10
d_train = lgb.Dataset(x_train, label=y_train )
d_test =  lgb.Dataset( x_test, label= y_test)
model2=lgb.train(params,d_train,400,
                valid_sets=d_test,verbose_eval=True,
                early_stopping_rounds=20)

[1]	valid_0's binary_logloss: 0.475444
[2]	valid_0's binary_logloss: 0.470444
[3]	valid_0's binary_logloss: 0.466851
[4]	valid_0's binary_logloss: 0.463001




[5]	valid_0's binary_logloss: 0.46034
[6]	valid_0's binary_logloss: 0.457675
[7]	valid_0's binary_logloss: 0.455067




[8]	valid_0's binary_logloss: 0.456381




[9]	valid_0's binary_logloss: 0.454925
[10]	valid_0's binary_logloss: 0.453575
[11]	valid_0's binary_logloss: 0.452183




[12]	valid_0's binary_logloss: 0.452572
[13]	valid_0's binary_logloss: 0.450861
[14]	valid_0's binary_logloss: 0.449846
[15]	valid_0's binary_logloss: 0.448876




[16]	valid_0's binary_logloss: 0.448002
[17]	valid_0's binary_logloss: 0.446015
[18]	valid_0's binary_logloss: 0.444453




[19]	valid_0's binary_logloss: 0.443003
[20]	valid_0's binary_logloss: 0.442081
[21]	valid_0's binary_logloss: 0.44243




[22]	valid_0's binary_logloss: 0.441203
[23]	valid_0's binary_logloss: 0.440238
[24]	valid_0's binary_logloss: 0.439281




[25]	valid_0's binary_logloss: 0.438717
[26]	valid_0's binary_logloss: 0.437891
[27]	valid_0's binary_logloss: 0.437064




[28]	valid_0's binary_logloss: 0.437264




[29]	valid_0's binary_logloss: 0.436515
[30]	valid_0's binary_logloss: 0.435799
[31]	valid_0's binary_logloss: 0.435731




[32]	valid_0's binary_logloss: 0.435221
[33]	valid_0's binary_logloss: 0.434356
[34]	valid_0's binary_logloss: 0.433828




[35]	valid_0's binary_logloss: 0.43422




[36]	valid_0's binary_logloss: 0.43479
[37]	valid_0's binary_logloss: 0.434298
[38]	valid_0's binary_logloss: 0.43376
[39]	valid_0's binary_logloss: 0.433051




[40]	valid_0's binary_logloss: 0.432898




[41]	valid_0's binary_logloss: 0.433283
[42]	valid_0's binary_logloss: 0.432736




[43]	valid_0's binary_logloss: 0.433019
[44]	valid_0's binary_logloss: 0.432369
[45]	valid_0's binary_logloss: 0.431889




[46]	valid_0's binary_logloss: 0.434396
[47]	valid_0's binary_logloss: 0.43335
[48]	valid_0's binary_logloss: 0.433168




[49]	valid_0's binary_logloss: 0.433448




[50]	valid_0's binary_logloss: 0.433997
[51]	valid_0's binary_logloss: 0.433214
[52]	valid_0's binary_logloss: 0.432467




[53]	valid_0's binary_logloss: 0.432834
[54]	valid_0's binary_logloss: 0.431963
[55]	valid_0's binary_logloss: 0.431367




[56]	valid_0's binary_logloss: 0.431749
[57]	valid_0's binary_logloss: 0.431334




[58]	valid_0's binary_logloss: 0.435066




[59]	valid_0's binary_logloss: 0.43535
[60]	valid_0's binary_logloss: 0.434028




[61]	valid_0's binary_logloss: 0.439565
[62]	valid_0's binary_logloss: 0.437228
[63]	valid_0's binary_logloss: 0.435408




[64]	valid_0's binary_logloss: 0.435606




[65]	valid_0's binary_logloss: 0.435985
[66]	valid_0's binary_logloss: 0.434428
[67]	valid_0's binary_logloss: 0.43306




[68]	valid_0's binary_logloss: 0.432014




[69]	valid_0's binary_logloss: 0.4323




[70]	valid_0's binary_logloss: 0.432461
[71]	valid_0's binary_logloss: 0.432559




[72]	valid_0's binary_logloss: 0.43186
[73]	valid_0's binary_logloss: 0.431125




[74]	valid_0's binary_logloss: 0.431378
[75]	valid_0's binary_logloss: 0.43085




[76]	valid_0's binary_logloss: 0.430811




[77]	valid_0's binary_logloss: 0.431064




[78]	valid_0's binary_logloss: 0.431391
[79]	valid_0's binary_logloss: 0.430727
[80]	valid_0's binary_logloss: 0.430198




[81]	valid_0's binary_logloss: 0.430495
[82]	valid_0's binary_logloss: 0.429868




[83]	valid_0's binary_logloss: 0.43019




[84]	valid_0's binary_logloss: 0.430414




[85]	valid_0's binary_logloss: 0.430763
[86]	valid_0's binary_logloss: 0.430325
[87]	valid_0's binary_logloss: 0.429893




[88]	valid_0's binary_logloss: 0.430232




[89]	valid_0's binary_logloss: 0.430487




[90]	valid_0's binary_logloss: 0.430779




[91]	valid_0's binary_logloss: 0.433197
[92]	valid_0's binary_logloss: 0.432228
[93]	valid_0's binary_logloss: 0.43128




[94]	valid_0's binary_logloss: 0.431726




[95]	valid_0's binary_logloss: 0.431961




[96]	valid_0's binary_logloss: 0.432275
[97]	valid_0's binary_logloss: 0.431408




[98]	valid_0's binary_logloss: 0.431691
[99]	valid_0's binary_logloss: 0.431025
[100]	valid_0's binary_logloss: 0.430374




[101]	valid_0's binary_logloss: 0.431686




[102]	valid_0's binary_logloss: 0.432017




[103]	valid_0's binary_logloss: 0.432406
[104]	valid_0's binary_logloss: 0.431411




[105]	valid_0's binary_logloss: 0.432897




[106]	valid_0's binary_logloss: 0.43322




[107]	valid_0's binary_logloss: 0.433642
[108]	valid_0's binary_logloss: 0.432496




[109]	valid_0's binary_logloss: 0.432816




[110]	valid_0's binary_logloss: 0.433141
[111]	valid_0's binary_logloss: 0.432076
[112]	valid_0's binary_logloss: 0.431093
[113]	valid_0's binary_logloss: 0.430428




[114]	valid_0's binary_logloss: 0.429851
[115]	valid_0's binary_logloss: 0.429381
[116]	valid_0's binary_logloss: 0.428835




[117]	valid_0's binary_logloss: 0.429066
[118]	valid_0's binary_logloss: 0.428518




[119]	valid_0's binary_logloss: 0.428737




[120]	valid_0's binary_logloss: 0.429052




[121]	valid_0's binary_logloss: 0.42936




[122]	valid_0's binary_logloss: 0.429692
[123]	valid_0's binary_logloss: 0.429169
[124]	valid_0's binary_logloss: 0.428692




[125]	valid_0's binary_logloss: 0.428946




[126]	valid_0's binary_logloss: 0.4292
[127]	valid_0's binary_logloss: 0.428578




[128]	valid_0's binary_logloss: 0.428834
[129]	valid_0's binary_logloss: 0.428365
[130]	valid_0's binary_logloss: 0.427949




[131]	valid_0's binary_logloss: 0.42815
[132]	valid_0's binary_logloss: 0.427717
[133]	valid_0's binary_logloss: 0.427427




[134]	valid_0's binary_logloss: 0.42806
[135]	valid_0's binary_logloss: 0.427675




[136]	valid_0's binary_logloss: 0.428747




[137]	valid_0's binary_logloss: 0.429024
[138]	valid_0's binary_logloss: 0.428377
[139]	valid_0's binary_logloss: 0.427842




[140]	valid_0's binary_logloss: 0.428128




[141]	valid_0's binary_logloss: 0.42837




[142]	valid_0's binary_logloss: 0.42862
[143]	valid_0's binary_logloss: 0.428116




[144]	valid_0's binary_logloss: 0.428336
[145]	valid_0's binary_logloss: 0.427694




[146]	valid_0's binary_logloss: 0.428015




[147]	valid_0's binary_logloss: 0.428284




[148]	valid_0's binary_logloss: 0.428581
[149]	valid_0's binary_logloss: 0.428112
[150]	valid_0's binary_logloss: 0.427579




[151]	valid_0's binary_logloss: 0.428195




[152]	valid_0's binary_logloss: 0.428956
[153]	valid_0's binary_logloss: 0.428301




[154]	valid_0's binary_logloss: 0.428514
[155]	valid_0's binary_logloss: 0.427988




KeyboardInterrupt: 

In [60]:
#encoded_df_test=encoded_df_test.drop(['id'],axis=1)
y_pred=model.predict(encoded_df_test)

In [61]:
y_pred

array([0.23946035, 0.31583122, 0.13287889, ..., 0.35443531, 0.24174781,
       0.21503457])

In [62]:
for i in range(0,100):
    if (y_pred[i] >= 0.5):
        y_pred[i] = 1
    else:
        y_pred[i] =0
len(y_pred) 

400000

In [63]:
sample = pd.read_csv("sample_submission.csv")
sample.loc[:, "target"] = y_pred
sample.to_csv("submission_using_lgbm.csv", index=False)

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

gs.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

In [None]:
gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501], # Updated from 'seed'
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

# Create classifier to use. Note that parameters have to be input manually
# not as a dict!
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # Updated from 'nthread'
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

mdl.get_params().keys()

# Create the grid
grid = GridSearchCV(mdl, gridParams,
                    verbose=0,
                    cv=4,
                    n_jobs=2)
# Run the grid
grid.fit(allTrainData, allTrainLabels)

In [None]:
features = [col for col in df_train.columns if col not in ['target', 'ID_code']]
X_test = df_test[features].values
feature_importance_df = pd.DataFrame()
predictions = df_test[['ID_code']]

for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, df_train['target'])):
    print("FOLD: ", fold, "TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = df_train.iloc[trn_idx][features], df_train.iloc[trn_idx]['target']
    X_valid, y_valid = df_train.iloc[val_idx][features], df_train.iloc[val_idx]['target']
    
    N = 5
    p_valid = 0
    yp = 0
    
    for i in range(N):
        
        trn_data = lgb.Dataset(X_train, label = y_train)
        val_data = lgb.Dataset(X_valid, label = y_valid)
        
        
        
        lgb_clf = lgb.train(lgb_params,
                   trn_data,
                   100000,
                   valid_sets = [trn_data, val_data],
                    verbose_eval = 5000,
                    early_stopping_rounds = 3000)
        
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
    
    
    #Get importance of the fold when predicting test set
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = lgb_clf.feature_importance()
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions['fold{}'.format(fold+1)] = yp/N