## Summary

##### We will investigate the performance of the LightGBM library on this problem. LightGBM  is a gradient boosting framework that uses tree based learning algorithms. The following experiments will be performed:

##### 1) Test it's performance directly on the raw data
##### 2) Test it's performance once we have standardize features by removing the mean and scaling to unit variance (sklearn.preprocessing.StandardScaler)
##### 3) Test it's performance once we have transformed features by scaling each feature to a given range (0, 1) (sklearn.preprocessing.MinMaxScaler)
##### 4) Test performance using cross validation



### Visualising the data

#### What files are in the input folder:

In [1]:
# All imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import gc


In [2]:
import os 
print('No. of files: {}'.format(str(len(os.listdir("./input")))))
print(os.listdir("./input"))

No. of files: 3
['sample_submission.csv', 'test.csv', 'train.csv']


##### Train and Test files

In [31]:
#Loading Train and Test Data

df_train = pd.read_csv("./input/train.csv")
df_test = pd.read_csv("./input/test.csv")

print("{} observations and {} features in train set.".format(df_train.shape[0], df_train.shape[1]))
print("{} observations and {} features in test set.".format(df_test.shape[0], df_test.shape[1]))

200000 observations and 202 features in train set.
200000 observations and 201 features in test set.


In [32]:
df_train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [33]:
df_test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


Let's now look at the target score 

In [6]:
df_train['target'].value_counts()

0    179902
1     20098
Name: target, dtype: int64

#### Predicting on RAW data

In [7]:
from sklearn.model_selection import train_test_split

df_train_target = df_train['target']
df_train_modified = df_train.iloc[:,2:]
df_test_modified = df_test.iloc[:,1:]

x_train, x_test, y_train, y_test = train_test_split(df_train_modified, df_train_target, random_state = 13, test_size = 0.3)

pd.DataFrame(data=x_train).head(3)

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
61092,11.0156,-1.7036,13.6113,10.7284,11.3013,3.5456,4.6526,11.0551,-2.4123,5.8285,...,-0.4699,3.7281,0.781,1.2648,23.2156,-1.2186,13.067,8.4942,21.0695,-22.0377
77702,4.5837,2.8545,8.2859,6.823,14.4487,2.0574,5.2793,14.4021,-4.9655,8.4677,...,6.7582,1.7876,3.298,4.559,14.3334,-2.0163,2.5545,8.2228,14.0187,1.9558
19517,9.2443,-7.7795,9.6477,8.1215,11.624,-9.6645,5.6095,21.3256,2.8393,7.1223,...,4.3627,11.8427,3.2404,14.2788,22.826,-2.9378,4.2337,9.6881,18.1063,-23.0591


In [8]:
pd.DataFrame(data=y_train).head(3)

Unnamed: 0,target
61092,0
77702,0
19517,0


In [9]:
pd.DataFrame(data=x_test).head(3)

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
55460,10.2572,2.2939,9.1147,7.9457,9.8134,-17.0926,5.0725,14.9654,2.8733,6.1608,...,12.4596,4.9281,2.5307,5.2673,19.8761,-1.1277,-7.2447,9.853,17.9966,-20.7829
191859,12.1454,2.6294,10.3763,8.2464,10.8403,-17.3967,6.669,17.1594,2.4456,9.453,...,9.1293,2.4748,1.5009,6.2691,13.4652,-1.4229,-8.6105,9.2023,17.3172,0.0352
46956,18.1069,1.3129,9.4579,3.3247,11.4447,-16.7164,6.0643,12.8521,2.2359,7.4008,...,0.2891,6.1889,1.2743,7.7563,22.3554,-1.7241,4.3412,8.4876,16.5669,-8.2945


In [10]:
pd.DataFrame(data=y_test).head(3)

Unnamed: 0,target
55460,0
191859,1
46956,0


In [11]:
# Helper function for making float values in interval [0,1] binary
def to_binary(x):
    return (0 if x < 0.5 else 1)

In [12]:
from lightgbm import LGBMRegressor, LGBMClassifier

LGB_params = {'num_leaves': 255,
         'objective': 'binary',
         'learning_rate': 0.02,
         'metric': 'rmse',
         'max_bin': 255,
         'num_iterations': 1000,
         'boosting': 'gbdt'}

LGB = LGBMClassifier(**LGB_params, nthread = 4, n_jobs = -1)

In [13]:
LGB.fit(x_train, y_train, verbose=True)



LGBMClassifier(boosting='gbdt', boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, importance_type='split', learning_rate=0.02,
        max_bin=255, max_depth=-1, metric='rmse', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, nthread=4, num_iterations=1000, num_leaves=255,
        objective='binary', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [14]:
y_pred = LGB.predict_proba(x_test) # for classifier
print(y_pred[:5])

[[0.99823299 0.00176701]
 [0.86700264 0.13299736]
 [0.99686009 0.00313991]
 [0.92157707 0.07842293]
 [0.52145702 0.47854298]]


In [15]:
from sklearn.metrics import roc_auc_score

roc_test = roc_auc_score(y_test, y_pred[:, 1:])
print(roc_test)

0.8897959976171909


In [16]:
prediction = LGB.predict_proba(df_test_modified)

prediction_LGB = [to_binary(x[0]) for x in prediction[:, 1:]]
print(sum(prediction_LGB))
submission = pd.read_csv('./input/sample_submission.csv')
submission['target'] = prediction_LGB
submission.to_csv('LGB_raw.csv', index=False)

2227


#### Standard Scaler

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(copy=True)
df_train_std = scaler.fit_transform(df_train.iloc[:,2:])
df_test_std = scaler.transform(df_test.iloc[:,1:])

In [25]:
x_train_std, x_test_std, y_train_std, y_test_std = train_test_split(df_train_std, df_train_target, random_state = 13, test_size = 0.3)


In [26]:
LGB_std = LGBMClassifier(**LGB_params, nthread = 4, n_jobs = -1)

In [27]:
LGB_std.fit(x_train_std, y_train_std, verbose=True)



LGBMClassifier(boosting='gbdt', boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, importance_type='split', learning_rate=0.02,
        max_bin=255, max_depth=-1, metric='rmse', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, nthread=4, num_iterations=1000, num_leaves=255,
        objective='binary', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [28]:
y_pred_std = LGB_std.predict_proba(x_test_std) # for classifier
print(y_pred_std[:5])

[[0.99646706 0.00353294]
 [0.87639094 0.12360906]
 [0.99481716 0.00518284]
 [0.9507472  0.0492528 ]
 [0.5391847  0.4608153 ]]


In [29]:
roc_test = roc_auc_score(y_test_std, y_pred_std[:, 1:])
print(roc_test)

0.8890192457068846


In [30]:
prediction_std = LGB_std.predict_proba(df_test_std)
print(len(prediction_std))

prediction_LGB_std = [to_binary(x[0]) for x in prediction_std[:, 1:]]
print(sum(prediction_LGB_std))
submission = pd.read_csv('./input/sample_submission.csv')
submission['target'] = prediction_LGB_std
submission.to_csv('LGB_std.csv', index=False)

200000
2228


#### MinMax Scaler

In [34]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
df_train_mm = scaler.fit_transform(df_train.iloc[:,2:])
df_test_mm = scaler.transform(df_test.iloc[:,1:])

In [35]:
x_train_mm, x_test_mm, y_train_mm, y_test_mm = train_test_split(df_train_mm, df_train_target, random_state = 13, test_size = 0.3)


In [36]:
LGB_mm = LGBMClassifier(**LGB_params, nthread = 4, n_jobs = -1)

In [37]:
LGB_mm.fit(x_train_mm, y_train_mm, verbose=True)



LGBMClassifier(boosting='gbdt', boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, importance_type='split', learning_rate=0.02,
        max_bin=255, max_depth=-1, metric='rmse', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, nthread=4, num_iterations=1000, num_leaves=255,
        objective='binary', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [38]:
y_pred_mm = LGB_mm.predict_proba(x_test_mm) # for classifier
print(y_pred_mm[:5])

[[0.9959884  0.0040116 ]
 [0.85263011 0.14736989]
 [0.99497228 0.00502772]
 [0.95011982 0.04988018]
 [0.55410479 0.44589521]]


In [39]:
roc_test = roc_auc_score(y_test_mm, y_pred_mm[:, 1:])
print(roc_test)

0.889490047770696


In [41]:
prediction_mm = LGB_mm.predict_proba(df_test_mm)
print(len(prediction_mm))

prediction_LGB_mm = [to_binary(x[0]) for x in prediction_mm[:, 1:]]
print(sum(prediction_LGB_mm))

200000
2229


#### LGB Cross validation

In [42]:
LGB_cross = LGBMClassifier(**LGB_params, nthread = 4, n_jobs = -1)

In [43]:
from sklearn.model_selection import cross_val_predict

y_pred_cross = cross_val_predict(LGB_cross, df_train.iloc[:,2:], df_train_target, cv=5, verbose=1000)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 17.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 25.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 33.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 42.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 42.0min finished


In [44]:
print(int(sum(y_pred_cross))) # submitted score of 0.499 CAT

3978


In [45]:
roc_test = roc_auc_score(df_train_target, y_pred_cross)
print(roc_test)

0.5842789979797706


In [46]:
submission = pd.read_csv('./input/sample_submission.csv')
submission['target'] = [to_binary(x) for x in y_pred_cross]
submission.to_csv('LGB_cross.csv', index=False)