In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import seaborn as sns

from mydatools.plot import plot_multiclass_feature_dist
from mydatools.features_analyze import get_top_k_corr

% matplotlib inline

## Config

In [2]:
trn_path = './data/input/application_train.csv'
tst_path = './data/input/application_test.csv'
id_col = 'SK_ID_CURR'
label_col = 'TARGET'

submission_path = './data/output/submission/submission.csv'
output_id_col = id_col
output_label_col = label_col

## Load Data

In [3]:
trn_df = pd.read_csv(trn_path)
trn_df['ds_type'] = 'trn'
trn_df[label_col] = trn_df[label_col].astype('int')

tst_df = pd.read_csv(tst_path)
tst_df['ds_type'] = 'tst'

full_df = pd.concat([trn_df, tst_df])

del(trn_df)
del(tst_df)

full_df.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,ds_type
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,trn
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0714,Block,MONDAY,0.9851,0.9851,0.9851,0.796,0.7987,0.804,trn
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,MONDAY,,,,,,,trn
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,,WEDNESDAY,,,,,,,trn
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,THURSDAY,,,,,,,trn


In [4]:
# dataset type
is_trn = full_df['ds_type'] == 'trn'
is_tst = full_df['ds_type'] == 'tst'

## Features

In [5]:
feature_columns = []

def add_features(features):
    if not isinstance(features, list):
        features = [features]
    global feature_columns
    feature_columns.extend([f for f in features if f not in feature_columns])

**numerical_features**

In [6]:
numerical_features = full_df.dtypes[full_df.dtypes != 'object'].index.tolist()
numerical_features = [c for c in numerical_features if c not in [id_col, label_col, 'ds_type']]
add_features(numerical_features)

**categorical_features**

In [7]:
categorical_features = full_df.dtypes[full_df.dtypes == 'object'].index.tolist()
full_df[categorical_features].nunique()

CODE_GENDER                    3
EMERGENCYSTATE_MODE            2
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
NAME_CONTRACT_TYPE             2
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
NAME_INCOME_TYPE               8
NAME_TYPE_SUITE                7
OCCUPATION_TYPE               18
ORGANIZATION_TYPE             58
WALLSMATERIAL_MODE             7
WEEKDAY_APPR_PROCESS_START     7
ds_type                        2
dtype: int64

In [8]:
categorical_features = full_df.dtypes[full_df.dtypes == 'object'].index.tolist()
categorical_features = categorical_features[:-1] # 为了去掉ds_type
# 去掉分类数太多
# categorical_features = full_df[categorical_features].columns[full_df[categorical_features].nunique() <= 10]

# get dummies
full_df = pd.get_dummies(full_df, columns=categorical_features, prefix_sep=':')

# add features
new_features = full_df.columns.tolist()[full_df.columns.tolist().index('ds_type') + 1:]
add_features(new_features)

**bureau.csv**

In [9]:
# 使用bureau每个指标的平均值作为新指标，分类特征先进行get_dummies再求平均值
bureau = pd.read_csv('./data/input/bureau.csv')

bureau = pd.get_dummies(bureau, columns=bureau.columns[bureau.dtypes == 'object'].tolist())
bureau = bureau.fillna(0)

bureau = bureau.groupby('SK_ID_CURR').mean()

bureau.columns = ['bureau_'+f for f in bureau.columns.tolist()]

In [10]:
full_df = full_df.join(bureau, on='SK_ID_CURR')
add_features(bureau.columns.tolist())

**previous_application.csv**

In [11]:
prevapp = pd.read_csv('./data/input/previous_application.csv')
prevapp = prevapp.drop('SK_ID_PREV', axis=1)

prevapp = pd.get_dummies(prevapp, columns=prevapp.columns[prevapp.dtypes == 'object'].tolist())
prevapp = prevapp.fillna(0)

prevapp = prevapp.groupby('SK_ID_CURR').mean()

prevapp.columns = ['prevapp_'+f for f in prevapp.columns.tolist()]

In [12]:
full_df = full_df.join(prevapp, on='SK_ID_CURR')
add_features(prevapp.columns.tolist())

## LightGBM

In [13]:
trn_df = full_df[is_trn]
tst_df = full_df[is_tst]

X = trn_df[feature_columns]
y = trn_df[label_col]
X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=0.20, random_state=20180521)

X_tst = tst_df[feature_columns]

d_trn = lgb.Dataset(X_trn.values, y_trn)
d_val = lgb.Dataset(X_val.values, y_val)

In [14]:
params = {
    'application': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 4,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'is_unbalance': True,
    'seed': 0,
    'reg_alpha': 0.1,
}
bst = lgb.train(params, d_trn, 500, valid_sets=[d_trn, d_val], early_stopping_rounds=20)

[1]	training's auc: 0.702702	valid_1's auc: 0.704245
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.710823	valid_1's auc: 0.713096
[3]	training's auc: 0.715576	valid_1's auc: 0.718471
[4]	training's auc: 0.719136	valid_1's auc: 0.723503
[5]	training's auc: 0.72087	valid_1's auc: 0.725025
[6]	training's auc: 0.721555	valid_1's auc: 0.726324
[7]	training's auc: 0.722511	valid_1's auc: 0.726977
[8]	training's auc: 0.723141	valid_1's auc: 0.727561
[9]	training's auc: 0.723425	valid_1's auc: 0.727993
[10]	training's auc: 0.725537	valid_1's auc: 0.730479
[11]	training's auc: 0.727759	valid_1's auc: 0.732152
[12]	training's auc: 0.728165	valid_1's auc: 0.732307
[13]	training's auc: 0.729559	valid_1's auc: 0.733431
[14]	training's auc: 0.72998	valid_1's auc: 0.733994
[15]	training's auc: 0.73117	valid_1's auc: 0.735118
[16]	training's auc: 0.731293	valid_1's auc: 0.735272
[17]	training's auc: 0.731306	valid_1's auc: 0.735081
[18]	training's auc: 0.731535	va

[153]	training's auc: 0.780543	valid_1's auc: 0.772016
[154]	training's auc: 0.780693	valid_1's auc: 0.772123
[155]	training's auc: 0.780746	valid_1's auc: 0.772136
[156]	training's auc: 0.780836	valid_1's auc: 0.772197
[157]	training's auc: 0.780966	valid_1's auc: 0.772279
[158]	training's auc: 0.781122	valid_1's auc: 0.772224
[159]	training's auc: 0.781194	valid_1's auc: 0.772191
[160]	training's auc: 0.781336	valid_1's auc: 0.772185
[161]	training's auc: 0.781469	valid_1's auc: 0.772249
[162]	training's auc: 0.781624	valid_1's auc: 0.772339
[163]	training's auc: 0.781737	valid_1's auc: 0.772394
[164]	training's auc: 0.781875	valid_1's auc: 0.772441
[165]	training's auc: 0.782054	valid_1's auc: 0.772503
[166]	training's auc: 0.782226	valid_1's auc: 0.772647
[167]	training's auc: 0.782329	valid_1's auc: 0.772667
[168]	training's auc: 0.782478	valid_1's auc: 0.772611
[169]	training's auc: 0.78261	valid_1's auc: 0.772676
[170]	training's auc: 0.782791	valid_1's auc: 0.772737
[171]	train

[303]	training's auc: 0.798341	valid_1's auc: 0.777607
[304]	training's auc: 0.798435	valid_1's auc: 0.777647
[305]	training's auc: 0.798537	valid_1's auc: 0.777699
[306]	training's auc: 0.798619	valid_1's auc: 0.777726
[307]	training's auc: 0.798704	valid_1's auc: 0.777743
[308]	training's auc: 0.798831	valid_1's auc: 0.777791
[309]	training's auc: 0.798933	valid_1's auc: 0.777789
[310]	training's auc: 0.799018	valid_1's auc: 0.777751
[311]	training's auc: 0.799106	valid_1's auc: 0.777766
[312]	training's auc: 0.799232	valid_1's auc: 0.777763
[313]	training's auc: 0.799341	valid_1's auc: 0.777804
[314]	training's auc: 0.799421	valid_1's auc: 0.777848
[315]	training's auc: 0.799527	valid_1's auc: 0.777913
[316]	training's auc: 0.799594	valid_1's auc: 0.777891
[317]	training's auc: 0.79968	valid_1's auc: 0.777815
[318]	training's auc: 0.799762	valid_1's auc: 0.777848
[319]	training's auc: 0.799806	valid_1's auc: 0.777839
[320]	training's auc: 0.799879	valid_1's auc: 0.777847
[321]	train

[454]	training's auc: 0.810775	valid_1's auc: 0.77949
[455]	training's auc: 0.810853	valid_1's auc: 0.779454
[456]	training's auc: 0.810933	valid_1's auc: 0.779423
[457]	training's auc: 0.811007	valid_1's auc: 0.779416
[458]	training's auc: 0.811045	valid_1's auc: 0.779418
[459]	training's auc: 0.811089	valid_1's auc: 0.77939
[460]	training's auc: 0.811111	valid_1's auc: 0.779404
[461]	training's auc: 0.811147	valid_1's auc: 0.779412
[462]	training's auc: 0.81121	valid_1's auc: 0.779435
[463]	training's auc: 0.811248	valid_1's auc: 0.779458
[464]	training's auc: 0.811384	valid_1's auc: 0.779483
[465]	training's auc: 0.811443	valid_1's auc: 0.77947
[466]	training's auc: 0.811529	valid_1's auc: 0.779419
[467]	training's auc: 0.811618	valid_1's auc: 0.779382
[468]	training's auc: 0.811705	valid_1's auc: 0.779353
[469]	training's auc: 0.811781	valid_1's auc: 0.779325
[470]	training's auc: 0.811895	valid_1's auc: 0.779267
[471]	training's auc: 0.811946	valid_1's auc: 0.779273
[472]	training

In [16]:
# bst = lgb.train(params, trn_lgb, 1000, valid_sets=[trn_lgb, val_lgb], early_stopping_rounds=20,
#                 init_model=bst, 
#                 learning_rates=lambda iter: 0.1 * (0.99 ** iter))

In [17]:
imp_df = pd.DataFrame([bst.feature_importance()], columns=feature_columns, index=['importance']).T.sort_values(by='importance', ascending=False)
imp_df

Unnamed: 0,importance
EXT_SOURCE_3,300
EXT_SOURCE_2,282
EXT_SOURCE_1,237
DAYS_BIRTH,207
prevapp_CNT_PAYMENT,149
AMT_CREDIT,147
AMT_ANNUITY,135
DAYS_EMPLOYED,121
AMT_GOODS_PRICE,119
DAYS_ID_PUBLISH,94


## predict

In [18]:
res_df = pd.DataFrame(bst.predict(X_tst), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].values
res_df[[output_id_col, output_label_col]].to_csv(submission_path, index=False)