In [1]:
# coding: utf-8
import os

import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid

from mydatools.plot import plot_multiclass_feature_dist
from mydatools.features_generate import mean_encoding

% matplotlib inline

## Config

In [2]:
# Input CSVs of the main application table (train and test splits).
trn_path = './data/input/application_train.csv'
tst_path = './data/input/application_test.csv'
# Key column identifying an application, and the target column to predict.
id_col = 'SK_ID_CURR'
label_col = 'TARGET'

# Destination of the prediction file and the column names written to it.
submission_path = './data/output/submission/EDA.csv'
output_id_col = id_col
output_label_col = label_col

## Load Data

In [3]:
# Read the train/test application tables and tag every row with its origin so the
# two splits can be processed together and separated again later.
trn_df = pd.read_csv(trn_path)
trn_df['ds_type'] = 'trn'
trn_df[label_col] = trn_df[label_col].astype('int')

tst_df = pd.read_csv(tst_path)
tst_df['ds_type'] = 'tst'

# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it the
# train and test row labels overlap, which makes later index-aligned operations
# (column assignment, boolean masks) fragile.
full_df = pd.concat([trn_df, tst_df], ignore_index=True)

# Free the per-split frames; they are rebuilt from full_df before modelling.
del trn_df, tst_df

full_df.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,ds_type
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0149,"Stone, brick",WEDNESDAY,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,trn
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0714,Block,MONDAY,0.9851,0.9851,0.9851,0.796,0.7987,0.804,trn
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,MONDAY,,,,,,,trn
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,,,WEDNESDAY,,,,,,,trn
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,THURSDAY,,,,,,,trn


In [4]:
# dataset type
# Boolean row masks telling which split each row of full_df came from
is_trn = full_df['ds_type'].eq('trn')
is_tst = full_df['ds_type'].eq('tst')

In [5]:
# (rows, columns) of the combined train + test frame
full_df.shape

(356255, 123)

**label非常不平衡，负样本与正样本之比大约为 11:1**

In [6]:
# Class counts of the target column (value_counts skips missing labels by default)
full_df[label_col].value_counts()

0.0    282686
1.0     24825
Name: TARGET, dtype: int64

## Features

In [7]:
# Ordered registry of the column names that will be fed to the model.
feature_columns = []


def add_features(features):
    """Register one or more feature names in ``feature_columns``.

    Parameters
    ----------
    features : list or single name
        A list of feature names, or a single name (anything that is not a
        ``list`` is treated as one name).

    Names are appended in the order given. A name already registered is
    skipped, and so is a name repeated within the same call, so the registry
    never contains duplicates.
    """
    if not isinstance(features, list):
        features = [features]
    # Check against the live registry one name at a time so that duplicates
    # inside `features` itself are caught as well. The list is mutated in
    # place, so no `global` declaration is needed.
    for name in features:
        if name not in feature_columns:
            feature_columns.append(name)

### application_train/test

**numerical_features**

In [8]:
# Every non-object column is treated as numerical, except the id, the label and
# the train/test marker, which must not be used as model inputs.
numerical_features = [
    col
    for col, dtype in full_df.dtypes.items()
    if dtype != 'object' and col not in (id_col, label_col, 'ds_type')
]
add_features(numerical_features)

In [9]:
# Summary statistics of the numerical feature columns
full_df[numerical_features].describe()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,TOTALAREA_MODE,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE
count,356219.0,356255.0,355977.0,356255.0,308687.0,308687.0,308687.0,308687.0,308687.0,308687.0,...,356255.0,356255.0,356255.0,185200.0,183392.0,183392.0,183392.0,119949.0,119949.0,119949.0
mean,27425.560657,587767.4,528020.0,170116.1,0.006281,0.005808,0.231697,0.304399,0.029995,1.911564,...,0.229661,0.015649,0.051371,0.103193,0.977889,0.977903,0.977239,0.752283,0.755548,0.759452
std,14732.80819,398623.7,366065.0,223506.8,0.10425,0.079736,0.855949,0.786915,0.191374,1.865338,...,0.420616,0.124113,0.220753,0.108041,0.057929,0.058562,0.063165,0.113267,0.112057,0.110112
min,1615.5,45000.0,40500.0,25650.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16731.0,270000.0,234000.0,112500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0415,0.9767,0.9767,0.9767,0.6872,0.6914,0.6994
50%,25078.5,500211.0,450000.0,153000.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.069,0.9816,0.9816,0.9816,0.7552,0.7585,0.7648
75%,34960.5,797557.5,675000.0,202500.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.1287,0.9866,0.9866,0.9866,0.8232,0.8256,0.8236
max,258025.5,4050000.0,4050000.0,117000000.0,9.0,4.0,27.0,261.0,8.0,25.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


**categorical_features**

In [10]:
# Object-dtype columns are the categorical candidates; show how many distinct
# values each of them takes.
categorical_features = [col for col, dtype in full_df.dtypes.items() if dtype == 'object']
full_df[categorical_features].nunique()

CODE_GENDER                    3
EMERGENCYSTATE_MODE            2
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
NAME_CONTRACT_TYPE             2
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
NAME_INCOME_TYPE               8
NAME_TYPE_SUITE                7
OCCUPATION_TYPE               18
ORGANIZATION_TYPE             58
WALLSMATERIAL_MODE             7
WEEKDAY_APPR_PROCESS_START     7
ds_type                        2
dtype: int64

In [11]:
# Drop the bookkeeping column by name. Slicing off the last element instead
# depends on column order and removes a real feature if this cell is re-run.
categorical_features = [c for c in categorical_features if c != 'ds_type']
# Optional: drop columns with too many categories
# categorical_features = full_df[categorical_features].columns[full_df[categorical_features].nunique() <= 10]

# One-hot encode the categorical columns; new columns are named "<column>:<value>"
temp_df = full_df[categorical_features].copy()
temp_df = pd.get_dummies(temp_df, prefix_sep=':')
for c in temp_df.columns:
    full_df[c] = temp_df[c]

# Register the one-hot columns as model features
add_features(temp_df.columns.tolist())

增加分类变量的mean-encoding，因为训练集和测试集完全随机分割，所以直接使用最普通的mean-encoding应该也不太会出现过拟合

In [12]:
# Add a mean-encoding feature for every categorical column.
# mean_encoding is imported from mydatools; judging from this call it returns the
# updated frame plus the name of the column it added — TODO confirm against the helper.
# NOTE(review): is_tst is presumably used to keep test rows out of the target
# statistics. The encoding is computed before the train/validation split made
# later in the notebook, so the validation score may be optimistic — verify.
for c in categorical_features:
    full_df, new_feat = mean_encoding(full_df, c, label_col, is_tst, return_feature_name=True)
    add_features(new_feat)

**各金额项（收入、贷款额、年金）与商品价格 AMT_GOODS_PRICE 的比值**

In [13]:
# Ratio of income, credit amount and annuity to the goods price. The column
# names keep the original "<numerator>_device_AMT_GOODS_PRICE" spelling.
new_features = []
for numerator in ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']:
    ratio_name = numerator + '_device_AMT_GOODS_PRICE'
    full_df[ratio_name] = full_df[numerator] / full_df['AMT_GOODS_PRICE']
    new_features.append(ratio_name)
add_features(new_features)

In [14]:
# Summary statistics of the three ratio features added above
full_df[new_features].describe()

Unnamed: 0,AMT_INCOME_TOTAL_device_AMT_GOODS_PRICE,AMT_CREDIT_device_AMT_GOODS_PRICE,AMT_ANNUITY_device_AMT_GOODS_PRICE
count,355977.0,355977.0,355941.0
mean,0.459271,1.124213,0.061961
std,0.581067,0.124172,0.025477
min,0.011801,0.15,0.0075
25%,0.225,1.0,0.043245
50%,0.35,1.1188,0.054095
75%,0.5625,1.198,0.07592
max,257.425743,6.0,0.3


### add extra data

In [15]:
def summary_extra_data(df, suffix, key='SK_ID_CURR'):
    """Aggregate an auxiliary table to one row per ``key``.

    Object-dtype columns are one-hot encoded, missing values are replaced by 0,
    and every column is then averaged per ``key``. A ``cnt`` column holding the
    number of source rows per ``key`` is appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one or more rows per ``key``; must contain the ``key`` column.
    suffix : str
        Text prepended to every output column name. Despite its name it is used
        as a prefix; the parameter name is kept for existing callers.
    key : str, default 'SK_ID_CURR'
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``key``, with columns ``<suffix><column>`` and ``<suffix>cnt``.
    """
    # One-hot encode the categorical (object-dtype) columns; the grouping key is
    # never encoded, even if it happens to hold strings.
    object_cols = [c for c in df.columns[df.dtypes == 'object'].tolist() if c != key]
    df = pd.get_dummies(df, columns=object_cols)
    # Missing values count as 0 in the averages below
    df = df.fillna(0)

    # Group once and reuse the grouping for both the means and the row counts
    grouped = df.groupby(key)
    df = pd.concat([grouped.mean(), grouped.size().rename('cnt')], axis=1)

    # Prefix every column so the names cannot clash with the main table
    df.columns = ['{}{}'.format(suffix, f) for f in df.columns.tolist()]

    return df

In [16]:
# bureau = pd.read_csv('./data/input/bureau.csv')
# test_df = bureau.iloc[:100, :5]
# test_df.head()

# summary_extra_data(test_df, 'test')

**bureau.csv**

In [17]:
# Load bureau.csv, drop its own row id, and collapse it to one row per SK_ID_CURR
bureau = (
    pd.read_csv('./data/input/bureau.csv')
    .drop('SK_ID_BUREAU', axis=1)
    .pipe(summary_extra_data, 'bureau_')
)

# Attach the aggregates to the main table and register them as features
full_df = full_df.join(bureau, on='SK_ID_CURR')
add_features(bureau.columns.tolist())

**previous_application.csv**

In [18]:
# Load previous_application.csv, drop its own row id, and collapse it to one row
# per SK_ID_CURR
prevapp = (
    pd.read_csv('./data/input/previous_application.csv')
    .drop('SK_ID_PREV', axis=1)
    .pipe(summary_extra_data, 'prevapp_')
)

# Attach the aggregates to the main table and register them as features
full_df = full_df.join(prevapp, on='SK_ID_CURR')
add_features(prevapp.columns.tolist())

**POS_CASH_balance.csv**

In [19]:
# Load POS_CASH_balance.csv, drop the columns that are not aggregated, and
# collapse it to one row per SK_ID_CURR
pcblc = (
    pd.read_csv('./data/input/POS_CASH_balance.csv')
    .drop(['SK_ID_PREV', 'MONTHS_BALANCE'], axis=1)
    .pipe(summary_extra_data, 'pcblc_')
)

# Attach the aggregates to the main table and register them as features
full_df = full_df.join(pcblc, on='SK_ID_CURR')
add_features(pcblc.columns.tolist())

**credit_card_balance.csv**

In [20]:
# Load credit_card_balance.csv, drop the columns that are not aggregated, and
# collapse it to one row per SK_ID_CURR
ccblc = (
    pd.read_csv('./data/input/credit_card_balance.csv')
    .drop(['SK_ID_PREV', 'MONTHS_BALANCE'], axis=1)
    .pipe(summary_extra_data, 'ccblc_')
)

# Attach the aggregates to the main table and register them as features
full_df = full_df.join(ccblc, on='SK_ID_CURR')
add_features(ccblc.columns.tolist())

**installments_payments.csv**

In [21]:
# Load installments_payments.csv, drop its row id, and collapse it to one row
# per SK_ID_CURR
installpay = (
    pd.read_csv('./data/input/installments_payments.csv')
    .drop(['SK_ID_PREV'], axis=1)
    .pipe(summary_extra_data, 'installpay_')
)

# Attach the aggregates to the main table and register them as features
full_df = full_df.join(installpay, on='SK_ID_CURR')
add_features(installpay.columns.tolist())

In [22]:
# Total number of registered model features
len(feature_columns)

510

## LightGBM

In [23]:
# Split the combined frame back into its train and test rows
trn_df = full_df[is_trn]
tst_df = full_df[is_tst]

# Feature matrix and target of the train rows; 20% is held out for validation
# with a fixed random_state so the split is reproducible
X = trn_df[feature_columns]
y = trn_df[label_col]
X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=0.20, random_state=20180521)

X_tst = tst_df[feature_columns]

# The LightGBM datasets are built from plain arrays (.values), so the model is
# not given the column names
d_trn = lgb.Dataset(X_trn.values, y_trn)
d_val = lgb.Dataset(X_val.values, y_val)

In [24]:
# Training parameters for a binary classifier evaluated with AUC
params = {
    'application': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 4,
    'num_leaves': 12,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'is_unbalance': True,
    'seed': 0,
    'reg_alpha': 0.1,
}
# Train for at most 500 rounds and stop once the validation score has not improved
# for 20 rounds. Early stopping is passed as a callback: the `early_stopping_rounds`
# keyword of lgb.train was removed in LightGBM 4.0, while the callback form works
# on both old and new versions.
bst = lgb.train(
    params,
    d_trn,
    500,
    valid_sets=[d_trn, d_val],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

[1]	training's auc: 0.701144	valid_1's auc: 0.703706
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.707259	valid_1's auc: 0.711033
[3]	training's auc: 0.714239	valid_1's auc: 0.718702
[4]	training's auc: 0.717754	valid_1's auc: 0.721655
[5]	training's auc: 0.728402	valid_1's auc: 0.731766
[6]	training's auc: 0.728375	valid_1's auc: 0.732793
[7]	training's auc: 0.728971	valid_1's auc: 0.73302
[8]	training's auc: 0.728876	valid_1's auc: 0.733186
[9]	training's auc: 0.728319	valid_1's auc: 0.732411
[10]	training's auc: 0.72839	valid_1's auc: 0.732786
[11]	training's auc: 0.730188	valid_1's auc: 0.734288
[12]	training's auc: 0.729872	valid_1's auc: 0.734323
[13]	training's auc: 0.730364	valid_1's auc: 0.734654
[14]	training's auc: 0.731718	valid_1's auc: 0.735945
[15]	training's auc: 0.732049	valid_1's auc: 0.736245
[16]	training's auc: 0.733504	valid_1's auc: 0.737745
[17]	training's auc: 0.73435	valid_1's auc: 0.738621
[18]	training's auc: 0.734511	va

[153]	training's auc: 0.783843	valid_1's auc: 0.777581
[154]	training's auc: 0.783981	valid_1's auc: 0.77766
[155]	training's auc: 0.784048	valid_1's auc: 0.777715
[156]	training's auc: 0.784157	valid_1's auc: 0.777788
[157]	training's auc: 0.784316	valid_1's auc: 0.777895
[158]	training's auc: 0.784441	valid_1's auc: 0.777962
[159]	training's auc: 0.784528	valid_1's auc: 0.777946
[160]	training's auc: 0.78467	valid_1's auc: 0.778029
[161]	training's auc: 0.784799	valid_1's auc: 0.778151
[162]	training's auc: 0.78496	valid_1's auc: 0.778277
[163]	training's auc: 0.785078	valid_1's auc: 0.778336
[164]	training's auc: 0.785226	valid_1's auc: 0.778414
[165]	training's auc: 0.785348	valid_1's auc: 0.778495
[166]	training's auc: 0.785436	valid_1's auc: 0.778466
[167]	training's auc: 0.785588	valid_1's auc: 0.778531
[168]	training's auc: 0.785742	valid_1's auc: 0.778646
[169]	training's auc: 0.785857	valid_1's auc: 0.778666
[170]	training's auc: 0.785985	valid_1's auc: 0.778749
[171]	trainin

[303]	training's auc: 0.79999	valid_1's auc: 0.78333
[304]	training's auc: 0.800096	valid_1's auc: 0.783382
[305]	training's auc: 0.800226	valid_1's auc: 0.783389
[306]	training's auc: 0.800286	valid_1's auc: 0.783426
[307]	training's auc: 0.800347	valid_1's auc: 0.783468
[308]	training's auc: 0.800434	valid_1's auc: 0.783441
[309]	training's auc: 0.800522	valid_1's auc: 0.783477
[310]	training's auc: 0.800598	valid_1's auc: 0.783486
[311]	training's auc: 0.8007	valid_1's auc: 0.783529
[312]	training's auc: 0.800856	valid_1's auc: 0.783633
[313]	training's auc: 0.800949	valid_1's auc: 0.783679
[314]	training's auc: 0.801008	valid_1's auc: 0.783653
[315]	training's auc: 0.801121	valid_1's auc: 0.78369
[316]	training's auc: 0.80118	valid_1's auc: 0.78371
[317]	training's auc: 0.801274	valid_1's auc: 0.783742
[318]	training's auc: 0.801322	valid_1's auc: 0.783728
[319]	training's auc: 0.801378	valid_1's auc: 0.783729
[320]	training's auc: 0.801432	valid_1's auc: 0.783764
[321]	training's 

In [25]:
# One-column table of feature importances, labelled by feature name and sorted
# from most to least important
imp_df = (
    pd.Series(bst.feature_importance(), index=feature_columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
imp_df

Unnamed: 0,importance
EXT_SOURCE_3,185
EXT_SOURCE_2,185
EXT_SOURCE_1,180
AMT_ANNUITY_device_AMT_GOODS_PRICE,123
pcblc_CNT_INSTALMENT_FUTURE,107
DAYS_BIRTH,96
DAYS_EMPLOYED,83
AMT_ANNUITY,83
AMT_CREDIT_device_AMT_GOODS_PRICE,72
installpay_AMT_PAYMENT,67


## predict

In [26]:
# Make sure the output directory exists; to_csv does not create missing folders.
submission_dir = os.path.dirname(submission_path)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)

# Score the test rows. The model was trained on plain arrays (X_trn.values), so
# prediction uses the same representation.
res_df = pd.DataFrame(bst.predict(X_tst.values), columns=[output_label_col])
res_df[output_id_col] = tst_df[output_id_col].values
# Write the id column first, then the predicted score
res_df[[output_id_col, output_label_col]].to_csv(submission_path, index=False)