# Step 0.1. Import necessary libraries 

In [1]:
# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Gradient boosting library
import lightgbm as lgb

# Step 0.2. Parameters 

In [2]:

# Name of the label column; used to pull the target out of the training frame.
TARGET_NAME = "target"

# Step 0.3. Data load 

In [3]:
%%time

train_data = pd.read_csv('./input/train.csv')
train_data[TARGET_NAME] = train_data[TARGET_NAME].str.slice(start=6).astype(int) - 1
train_data.head()

CPU times: user 549 ms, sys: 110 ms, total: 660 ms
Wall time: 663 ms


Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,5
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,5
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,1
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,7
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Read the test set and preview its first five rows.
test_data = pd.read_csv('./input/test.csv')
test_data.head(n=5)

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
0,200000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,200001,1,2,0,0,0,0,0,0,0,...,3,1,3,0,0,0,0,3,0,0
2,200002,0,1,7,1,0,0,0,0,6,...,3,0,0,0,0,3,0,2,0,0
3,200003,0,0,0,4,3,1,0,0,0,...,0,0,0,1,0,0,0,4,0,0
4,200004,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# Read the sample submission; it is later used as the template for the output file.
submission = pd.read_csv('./input/sample_submission.csv')
submission.head(n=5)

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,200000,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
1,200001,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
2,200002,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
3,200003,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
4,200004,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111


# Step 0.5. Feature importance (permutation vs. LightGBM built-in)

In [10]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_selection import SelectKBest

In [11]:
# Columns that are not features: the row identifier and the label itself.
remover = ['id', TARGET_NAME]

X = train_data.drop(remover, axis=1)
y = train_data[TARGET_NAME]

# Hold out half of the rows; the permutation scores below are measured on
# this held-out half only.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.5, random_state=0)

# NOTE(review): per the LightGBM docs, subsample (bagging_fraction) only takes
# effect when subsample_freq > 0. It is left at its default here, so row
# subsampling is presumably inactive -- confirm before relying on it.
modelo = lgb.LGBMClassifier(n_estimators=100,
                            learning_rate=0.05,
                            max_depth=6,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            random_state=9, n_jobs=6)
modelo.fit(Xtrain, ytrain)

# Reference log loss on the untouched validation set. It is computed instead
# of hard-coded so the deltas stay correct when the data or the model changes.
baseline_ll = log_loss(yval, modelo.predict_proba(Xval))

# Dedicated seeded generator: every run shuffles each column the same way,
# which makes the reported scores reproducible.
perm_rng = np.random.RandomState(0)

# res maps feature name -> validation log loss after shuffling that feature.
res = dict()
for col in X.columns:
    Xval_ = Xval.copy()
    # Shuffle the raw values so the column keeps its distribution but loses
    # its relationship with the target.
    Xval_[col] = perm_rng.permutation(Xval_[col].values)

    p = modelo.predict_proba(Xval_)
    ll = log_loss(yval, p)
    res[col] = ll
    # A negative delta means shuffling the feature made the model worse.
    print("Feature: {} | LL: {:.4f} | LL Delta: {:.4f}".format(col, ll, baseline_ll - ll))


Feature: feature_0 | LL: 1.7528 | LL Delta: -0.0001
Feature: feature_1 | LL: 1.7536 | LL Delta: -0.0009
Feature: feature_2 | LL: 1.7548 | LL Delta: -0.0021
Feature: feature_3 | LL: 1.7545 | LL Delta: -0.0018
Feature: feature_4 | LL: 1.7533 | LL Delta: -0.0006
Feature: feature_5 | LL: 1.7543 | LL Delta: -0.0016
Feature: feature_6 | LL: 1.7529 | LL Delta: -0.0002
Feature: feature_7 | LL: 1.7530 | LL Delta: -0.0003
Feature: feature_8 | LL: 1.7544 | LL Delta: -0.0017
Feature: feature_9 | LL: 1.7530 | LL Delta: -0.0003
Feature: feature_10 | LL: 1.7532 | LL Delta: -0.0005
Feature: feature_11 | LL: 1.7537 | LL Delta: -0.0010
Feature: feature_12 | LL: 1.7573 | LL Delta: -0.0046
Feature: feature_13 | LL: 1.7537 | LL Delta: -0.0010
Feature: feature_14 | LL: 1.7549 | LL Delta: -0.0022
Feature: feature_15 | LL: 1.7527 | LL Delta: -0.0000
Feature: feature_16 | LL: 1.7550 | LL Delta: -0.0023
Feature: feature_17 | LL: 1.7531 | LL Delta: -0.0004
Feature: feature_18 | LL: 1.7568 | LL Delta: -0.0041
Fea

In [19]:
# Permutation importance: how much the validation log loss grows when a
# feature is shuffled, ordered from most to least important. The baseline is
# recomputed from the fitted model rather than typed in as a rounded constant.
baseline_ll = log_loss(yval, modelo.predict_proba(Xval))
fimp_perm = (pd.Series(res) - baseline_ll).sort_values(ascending=False)

In [20]:
# The model's built-in feature importances, labelled by feature name and
# ordered from largest to smallest.
fimp_lgbm = (
    pd.Series(data=modelo.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

In [24]:
# Place both importance measures side by side (pandas aligns the two Series
# on feature name) and show how strongly they correlate.
fimps = pd.DataFrame({"fimp_perm": fimp_perm, "fimp_lgbm": fimp_lgbm})
fimps.corr(method="pearson")

Unnamed: 0,fimp_perm,fimp_lgbm
fimp_perm,1.0,0.663498
fimp_lgbm,0.663498,1.0


In [30]:
# Turn each measure into a rank (1 = largest importance) and order the rows
# by the permutation-based rank.
fimps_rank = fimps.rank(ascending=False).sort_values(by="fimp_perm")

In [34]:
# Average the two ranks per feature and show the 20 lowest (best) mean ranks.
(
    fimps_rank
    .mean(axis=1)
    .sort_values()
    .head(20)
)

feature_43     2.0
feature_12     3.5
feature_20     6.5
feature_37     9.0
feature_31     9.5
feature_14     9.5
feature_53    10.0
feature_18    10.5
feature_69    11.0
feature_54    11.5
feature_72    12.0
feature_40    12.0
feature_28    14.0
feature_56    14.5
feature_3     15.0
feature_65    15.5
feature_11    17.5
feature_39    18.0
feature_2     18.5
feature_19    19.0
dtype: float64

## Step 4. Predict for test data

In [39]:
%%time

# Score the test set with the model fitted above. NOTE: `modelo` was fitted on
# the 50% training split only, not on the full training data.
Xtest = test_data.drop(columns=['id'])
test_pred = modelo.predict_proba(Xtest)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

Prediction for test data:
[[0.04944687 0.38136467 0.15238992 0.02804802 0.01625585 0.15706701
  0.02957129 0.06453733 0.12131903]
 [0.04182829 0.11194847 0.07974603 0.01921171 0.01301967 0.24063403
  0.06796051 0.2914109  0.13424039]
 [0.03135934 0.0365659  0.02916993 0.01495227 0.00930413 0.64966175
  0.03791206 0.12582645 0.06524817]
 [0.05681054 0.09477449 0.07133308 0.06759163 0.01834014 0.22120138
  0.06755789 0.23761375 0.1647771 ]
 [0.04811521 0.10235665 0.06775348 0.02752761 0.01535444 0.30512659
  0.05423843 0.23162949 0.14789811]
 [0.0538255  0.18572355 0.10548986 0.03079459 0.00851223 0.28509499
  0.0486566  0.14618253 0.13572016]
 [0.04051454 0.10155454 0.09628989 0.02700698 0.01734629 0.20785543
  0.0950434  0.24772725 0.16666167]
 [0.05283235 0.42581148 0.15739456 0.02702927 0.01775131 0.0528065
  0.04543437 0.08300422 0.13793593]
 [0.0385435  0.05666541 0.04116004 0.01918719 0.01874088 0.36288727
  0.0772345  0.26663423 0.11894698]
 [0.05821159 0.04823691 0.02245061 0.01

## Step 5. Prepare submission

In [38]:
# Re-display the first rows of the test set for reference.
test_data.head(n=5)

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
0,200000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,200001,1,2,0,0,0,0,0,0,0,...,3,1,3,0,0,0,0,3,0,0
2,200002,0,1,7,1,0,0,0,0,6,...,3,0,0,0,0,3,0,2,0,0
3,200003,0,0,0,4,3,1,0,0,0,...,0,0,0,1,0,0,0,4,0,0
4,200004,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [40]:
# The submission template still holds the placeholder probabilities here.
submission.head(n=5)

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,200000,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
1,200001,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
2,200002,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
3,200003,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
4,200004,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111


In [41]:
# Show the predicted class-probability matrix for the test set.
test_pred

array([[0.04944687, 0.38136467, 0.15238992, ..., 0.02957129, 0.06453733,
        0.12131903],
       [0.04182829, 0.11194847, 0.07974603, ..., 0.06796051, 0.2914109 ,
        0.13424039],
       [0.03135934, 0.0365659 , 0.02916993, ..., 0.03791206, 0.12582645,
        0.06524817],
       ...,
       [0.06148192, 0.25823243, 0.14251135, ..., 0.04202726, 0.13513495,
        0.15825685],
       [0.03269746, 0.02734209, 0.02067823, ..., 0.08398331, 0.28918354,
        0.07123829],
       [0.04632209, 0.06821666, 0.06314789, ..., 0.07466514, 0.22520005,
        0.13653752]])

In [43]:
# Dimensions of the prediction matrix (rows, classes).
test_pred.shape

(100000, 9)

In [42]:
# Dimensions of the submission frame without its first column; this has to
# equal test_pred.shape for the positional assignment further down to work.
submission.iloc[:, 1:].shape

(100000, 9)

In [45]:
# Last look at the template before it is overwritten with predictions.
submission.head(n=5)

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,200000,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
1,200001,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
2,200002,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
3,200003,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
4,200004,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111


In [46]:
# The assignment below is purely positional, so guard it: a mismatch in shape
# or row order would otherwise produce a wrong file without raising any error.
assert test_pred.shape == submission.iloc[:, 1:].shape, "prediction/submission shape mismatch"
assert np.array_equal(submission['id'].values, test_data['id'].values), "submission ids are not in test_data order"

# Overwrite every column after the first with the predicted probabilities.
# NOTE(review): this relies on predict_proba's columns following
# modelo.classes_ in the same order as the template's class columns -- confirm.
submission.iloc[:, 1:] = test_pred
submission.to_csv('submission_super_incroivel_do_tutorial.csv', index = False)

In [48]:
# Print the first lines of the written CSV using the shell's `head` command
# (IPython `!` escape).
!head submission_super_incroivel_do_tutorial.csv

id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
200000,0.04944686961304062,0.38136466906092553,0.15238992178673308,0.028048016514760587,0.01625585193140605,0.15706701456213734,0.0295712941421512,0.0645373348346226,0.121319027554223
200001,0.041828287561369325,0.11194846947595687,0.07974602831527923,0.019211712924054478,0.013019667170085437,0.24063403433644362,0.06796050525604683,0.2914109023528768,0.13424039260788734
200002,0.03135934176896103,0.03656589869578677,0.029169928360088564,0.014952270039455592,0.009304128917526633,0.649661751769381,0.037912062007045666,0.12582644725573613,0.06524817118601862
200003,0.05681054120915981,0.09477448599222824,0.07133308474882201,0.06759163164084846,0.01834014013680466,0.22120137563841458,0.0675578904137499,0.23761374629707563,0.16477710392289674
200004,0.048115207200992625,0.10235664874319124,0.06775347826686222,0.027527606568708257,0.015354439510899483,0.30512659178434404,0.05423843184877615,0.2316294891416547,0.147898

In [47]:
# Display the submission frame after the predictions have been written into it.
submission

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,200000,0.049447,0.381365,0.152390,0.028048,0.016256,0.157067,0.029571,0.064537,0.121319
1,200001,0.041828,0.111948,0.079746,0.019212,0.013020,0.240634,0.067961,0.291411,0.134240
2,200002,0.031359,0.036566,0.029170,0.014952,0.009304,0.649662,0.037912,0.125826,0.065248
3,200003,0.056811,0.094774,0.071333,0.067592,0.018340,0.221201,0.067558,0.237614,0.164777
4,200004,0.048115,0.102357,0.067753,0.027528,0.015354,0.305127,0.054238,0.231629,0.147898
...,...,...,...,...,...,...,...,...,...,...
99995,299995,0.054736,0.405107,0.144628,0.024558,0.014770,0.105107,0.040471,0.089994,0.120630
99996,299996,0.054184,0.265778,0.131321,0.026013,0.014491,0.143454,0.047156,0.163771,0.153832
99997,299997,0.061482,0.258232,0.142511,0.030974,0.015191,0.156190,0.042027,0.135135,0.158257
99998,299998,0.032697,0.027342,0.020678,0.012697,0.012896,0.449285,0.083983,0.289184,0.071238
