In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import seaborn as sns

from mydatools.features_analyze import get_top_k_corr

% matplotlib inline

## Config

In [2]:
# --- Input files -----------------------------------------------------------
trn_path = './data/input/application_train.csv'
tst_path = './data/input/application_test.csv'

# --- Output file -----------------------------------------------------------
submission_path = './data/output/submission/submission.csv'

# --- Column names ----------------------------------------------------------
# Row identifier and prediction target as they appear in the input tables.
id_col, label_col = 'SK_ID_CURR', 'TARGET'
# The submission file reuses the same two column names.
output_id_col, output_label_col = id_col, label_col

## Load Data

In [3]:
# Read the train and test tables and tag every row with the file it came
# from ('trn' / 'tst') so the two can be stacked now and separated later.
trn_df = pd.read_csv(trn_path)
trn_df['ds_type'] = 'trn'
trn_df[label_col] = trn_df[label_col].astype('int')

tst_df = pd.read_csv(tst_path)
tst_df['ds_type'] = 'tst'

# ignore_index=True gives the stacked frame a fresh, unique 0..n-1 index.
# Without it each input keeps its own 0-based labels, so the same label would
# point at both a train row and a test row in any label-based operation.
full_df = pd.concat([trn_df, tst_df], ignore_index=True)

# Release the per-file frames now that their rows live in full_df.
del trn_df, tst_df

full_df.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,ds_type
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,trn
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0714,Block,MONDAY,0.9851,0.9851,0.9851,0.796,0.7987,0.804,trn
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,MONDAY,,,,,,,trn
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,,WEDNESDAY,,,,,,,trn
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,THURSDAY,,,,,,,trn


In [4]:
# Boolean row masks over full_df: which rows came from the training file
# and which from the test file, based on the 'ds_type' tag column.
is_trn = full_df['ds_type'].eq('trn')
is_tst = full_df['ds_type'].eq('tst')

## Features

In [5]:
# Ordered list of column names used as model inputs; filled by add_features().
feature_columns = []

def add_features(features):
    """Register one or more column names in the global ``feature_columns`` list.

    Parameters
    ----------
    features : list or single value
        A list of column names, or one column name. Any value that is not a
        ``list`` is treated as a single name.

    Notes
    -----
    ``feature_columns`` is extended in place. A name is appended only if it
    is not already registered, which also covers names repeated inside
    ``features`` itself. First-seen order is preserved.
    """
    if not isinstance(features, list):
        features = [features]
    # Check membership at append time (not up front) so that duplicates
    # within the same call are caught as well.
    for name in features:
        if name not in feature_columns:
            feature_columns.append(name)

In [6]:
# Every non-object column of full_df becomes a model feature, except the
# row id, the target and the train/test tag.
numerical_features = [
    col
    for col, col_dtype in full_df.dtypes.items()
    if col_dtype != 'object' and col not in [id_col, label_col, 'ds_type']
]
add_features(numerical_features)

## LightGBM

In [7]:
# Separate the stacked frame back into its training and test rows.
trn_df = full_df[is_trn]
tst_df = full_df[is_tst]

# Model inputs for the test rows, and inputs / target for the training rows.
X_tst = tst_df[feature_columns]
X, y = trn_df[feature_columns], trn_df[label_col]

# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is the same on every run.
X_trn, X_val, y_trn, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# LightGBM datasets built from the raw NumPy arrays of each split.
d_trn = lgb.Dataset(data=X_trn.values, label=y_trn)
d_val = lgb.Dataset(data=X_val.values, label=y_val)

In [12]:
# LightGBM settings: binary target scored by AUC, small learning rate,
# shallow trees, column and row subsampling, fixed seed.
params = dict(
    application='binary',
    metric='auc',
    learning_rate=0.01,
    max_depth=5,
    num_leaves=20,
    feature_fraction=0.8,
    bagging_fraction=0.5,
    bagging_freq=1,
    is_unbalance=True,
    seed=0,
    reg_alpha=0.1,
)

# Train for at most 1000 rounds, scoring both the training and validation
# sets each round; stop once validation scores fail to improve for 20 rounds.
bst = lgb.train(
    params,
    d_trn,
    num_boost_round=1000,
    valid_sets=[d_trn, d_val],
    early_stopping_rounds=20,
)

[1]	training's auc: 0.708205	valid_1's auc: 0.704915
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.71707	valid_1's auc: 0.713655
[3]	training's auc: 0.720519	valid_1's auc: 0.716905
[4]	training's auc: 0.723894	valid_1's auc: 0.719428
[5]	training's auc: 0.724062	valid_1's auc: 0.719811
[6]	training's auc: 0.723832	valid_1's auc: 0.719697
[7]	training's auc: 0.724375	valid_1's auc: 0.720112
[8]	training's auc: 0.72528	valid_1's auc: 0.720403
[9]	training's auc: 0.725628	valid_1's auc: 0.720654
[10]	training's auc: 0.725683	valid_1's auc: 0.720979
[11]	training's auc: 0.725813	valid_1's auc: 0.721191
[12]	training's auc: 0.726163	valid_1's auc: 0.721036
[13]	training's auc: 0.726496	valid_1's auc: 0.721426
[14]	training's auc: 0.726452	valid_1's auc: 0.72174
[15]	training's auc: 0.726509	valid_1's auc: 0.722012
[16]	training's auc: 0.728027	valid_1's auc: 0.723263
[17]	training's auc: 0.727868	valid_1's auc: 0.723023
[18]	training's auc: 0.727851	va

[152]	training's auc: 0.741385	valid_1's auc: 0.733886
[153]	training's auc: 0.741424	valid_1's auc: 0.733915
[154]	training's auc: 0.741536	valid_1's auc: 0.734023
[155]	training's auc: 0.741564	valid_1's auc: 0.734057
[156]	training's auc: 0.741608	valid_1's auc: 0.734089
[157]	training's auc: 0.741689	valid_1's auc: 0.734147
[158]	training's auc: 0.741732	valid_1's auc: 0.73417
[159]	training's auc: 0.741796	valid_1's auc: 0.734238
[160]	training's auc: 0.74182	valid_1's auc: 0.73426
[161]	training's auc: 0.741962	valid_1's auc: 0.734345
[162]	training's auc: 0.742053	valid_1's auc: 0.734397
[163]	training's auc: 0.742077	valid_1's auc: 0.734401
[164]	training's auc: 0.742178	valid_1's auc: 0.734516
[165]	training's auc: 0.742235	valid_1's auc: 0.734539
[166]	training's auc: 0.74227	valid_1's auc: 0.73456
[167]	training's auc: 0.742297	valid_1's auc: 0.734562
[168]	training's auc: 0.742365	valid_1's auc: 0.734605
[169]	training's auc: 0.742481	valid_1's auc: 0.734675
[170]	training'

[303]	training's auc: 0.751265	valid_1's auc: 0.741197
[304]	training's auc: 0.751312	valid_1's auc: 0.74122
[305]	training's auc: 0.751373	valid_1's auc: 0.741259
[306]	training's auc: 0.75141	valid_1's auc: 0.741291
[307]	training's auc: 0.751451	valid_1's auc: 0.741298
[308]	training's auc: 0.751548	valid_1's auc: 0.741359
[309]	training's auc: 0.751616	valid_1's auc: 0.741413
[310]	training's auc: 0.751696	valid_1's auc: 0.741478
[311]	training's auc: 0.75176	valid_1's auc: 0.741518
[312]	training's auc: 0.751791	valid_1's auc: 0.741542
[313]	training's auc: 0.751833	valid_1's auc: 0.741549
[314]	training's auc: 0.751892	valid_1's auc: 0.74159
[315]	training's auc: 0.751977	valid_1's auc: 0.741649
[316]	training's auc: 0.75201	valid_1's auc: 0.741673
[317]	training's auc: 0.752076	valid_1's auc: 0.741727
[318]	training's auc: 0.752118	valid_1's auc: 0.741741
[319]	training's auc: 0.752165	valid_1's auc: 0.741774
[320]	training's auc: 0.752227	valid_1's auc: 0.74182
[321]	training's

[453]	training's auc: 0.758845	valid_1's auc: 0.74642
[454]	training's auc: 0.758877	valid_1's auc: 0.746428
[455]	training's auc: 0.758919	valid_1's auc: 0.746455
[456]	training's auc: 0.758975	valid_1's auc: 0.746502
[457]	training's auc: 0.759001	valid_1's auc: 0.746523
[458]	training's auc: 0.759046	valid_1's auc: 0.74656
[459]	training's auc: 0.75908	valid_1's auc: 0.746593
[460]	training's auc: 0.759127	valid_1's auc: 0.74662
[461]	training's auc: 0.759163	valid_1's auc: 0.746625
[462]	training's auc: 0.759209	valid_1's auc: 0.74664
[463]	training's auc: 0.759242	valid_1's auc: 0.746657
[464]	training's auc: 0.759281	valid_1's auc: 0.746675
[465]	training's auc: 0.759325	valid_1's auc: 0.746695
[466]	training's auc: 0.759362	valid_1's auc: 0.746723
[467]	training's auc: 0.759422	valid_1's auc: 0.746774
[468]	training's auc: 0.759495	valid_1's auc: 0.746839
[469]	training's auc: 0.759535	valid_1's auc: 0.74686
[470]	training's auc: 0.75956	valid_1's auc: 0.746872
[471]	training's 

[603]	training's auc: 0.764212	valid_1's auc: 0.749298
[604]	training's auc: 0.764243	valid_1's auc: 0.749316
[605]	training's auc: 0.764273	valid_1's auc: 0.749339
[606]	training's auc: 0.764303	valid_1's auc: 0.749368
[607]	training's auc: 0.764332	valid_1's auc: 0.749371
[608]	training's auc: 0.764358	valid_1's auc: 0.749387
[609]	training's auc: 0.764384	valid_1's auc: 0.749407
[610]	training's auc: 0.764404	valid_1's auc: 0.749407
[611]	training's auc: 0.764434	valid_1's auc: 0.749426
[612]	training's auc: 0.76445	valid_1's auc: 0.749429
[613]	training's auc: 0.764475	valid_1's auc: 0.749442
[614]	training's auc: 0.764508	valid_1's auc: 0.74946
[615]	training's auc: 0.764535	valid_1's auc: 0.749483
[616]	training's auc: 0.764571	valid_1's auc: 0.749501
[617]	training's auc: 0.764618	valid_1's auc: 0.749541
[618]	training's auc: 0.764652	valid_1's auc: 0.749566
[619]	training's auc: 0.764688	valid_1's auc: 0.74957
[620]	training's auc: 0.764705	valid_1's auc: 0.749566
[621]	trainin

[755]	training's auc: 0.768633	valid_1's auc: 0.751319
[756]	training's auc: 0.76865	valid_1's auc: 0.75133
[757]	training's auc: 0.768688	valid_1's auc: 0.751347
[758]	training's auc: 0.768721	valid_1's auc: 0.751354
[759]	training's auc: 0.768754	valid_1's auc: 0.751358
[760]	training's auc: 0.768775	valid_1's auc: 0.751353
[761]	training's auc: 0.768809	valid_1's auc: 0.751364
[762]	training's auc: 0.768835	valid_1's auc: 0.751366
[763]	training's auc: 0.768843	valid_1's auc: 0.751372
[764]	training's auc: 0.768875	valid_1's auc: 0.751395
[765]	training's auc: 0.768896	valid_1's auc: 0.751401
[766]	training's auc: 0.768931	valid_1's auc: 0.751426
[767]	training's auc: 0.768948	valid_1's auc: 0.751427
[768]	training's auc: 0.768978	valid_1's auc: 0.751428
[769]	training's auc: 0.769015	valid_1's auc: 0.751443
[770]	training's auc: 0.769036	valid_1's auc: 0.751449
[771]	training's auc: 0.769074	valid_1's auc: 0.751455
[772]	training's auc: 0.769102	valid_1's auc: 0.751464
[773]	traini

[906]	training's auc: 0.772435	valid_1's auc: 0.752447
[907]	training's auc: 0.772456	valid_1's auc: 0.752451
[908]	training's auc: 0.772476	valid_1's auc: 0.752448
[909]	training's auc: 0.772509	valid_1's auc: 0.752457
[910]	training's auc: 0.772526	valid_1's auc: 0.752465
[911]	training's auc: 0.772559	valid_1's auc: 0.752491
[912]	training's auc: 0.772583	valid_1's auc: 0.752502
[913]	training's auc: 0.772593	valid_1's auc: 0.752498
[914]	training's auc: 0.772618	valid_1's auc: 0.752489
[915]	training's auc: 0.772645	valid_1's auc: 0.752477
[916]	training's auc: 0.772671	valid_1's auc: 0.75248
[917]	training's auc: 0.77269	valid_1's auc: 0.752482
[918]	training's auc: 0.772722	valid_1's auc: 0.75249
[919]	training's auc: 0.772748	valid_1's auc: 0.752493
[920]	training's auc: 0.772775	valid_1's auc: 0.752492
[921]	training's auc: 0.772797	valid_1's auc: 0.7525
[922]	training's auc: 0.772818	valid_1's auc: 0.752507
[923]	training's auc: 0.772832	valid_1's auc: 0.75252
[924]	training's

In [9]:
# Optional: continue training from the current `bst` with a decaying learning rate.
# bst = lgb.train(params, d_trn, 1000, valid_sets=[d_trn, d_val], early_stopping_rounds=20,
#                 init_model=bst,
#                 learning_rates=lambda iter: 0.1 * (0.99 ** iter))

In [10]:
# imp_df = pd.DataFrame([bst.feature_importance()], columns=feature_columns, index=['importance']).T.sort_values(by='importance', ascending=False)
# imp_df

## Predict

In [11]:
# Score the test rows and pair each prediction with its row id.
res_df = pd.DataFrame(bst.predict(X_tst), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].values

# Create the submission directory on first use; to_csv does not create
# missing parent directories and would otherwise fail after training is done.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)

# Write id first, then the predicted value, without the DataFrame index.
res_df[[output_id_col, output_label_col]].to_csv(submission_path, index=False)