# 1. SETTINGS

In [14]:
# libraries
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# pandas options
pd.set_option("display.max_columns", None)

In [16]:
# garbage collection
import gc
gc.enable()

In [17]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [18]:
# random settings
seed = 5

# 2. PREPARATIONS

In [19]:
# import data
train = pd.read_csv("../data/prepared/train_app.csv")
test  = pd.read_csv("../data/prepared/test_app.csv")
y     = pd.read_csv("../data/prepared/y_app.csv")

In [20]:
# check train
train.head()

Unnamed: 0,SK_ID_CURR,app_CNT_CHILDREN,app_AMT_INCOME_TOTAL,app_AMT_CREDIT,app_AMT_ANNUITY,app_AMT_GOODS_PRICE,app_REGION_POPULATION_RELATIVE,app_DAYS_BIRTH,app_DAYS_EMPLOYED,app_DAYS_REGISTRATION,app_DAYS_ID_PUBLISH,app_OWN_CAR_AGE,app_FLAG_MOBIL,app_FLAG_EMP_PHONE,app_FLAG_WORK_PHONE,app_FLAG_CONT_MOBILE,app_FLAG_PHONE,app_FLAG_EMAIL,app_CNT_FAM_MEMBERS,app_REGION_RATING_CLIENT,app_REGION_RATING_CLIENT_W_CITY,app_HOUR_APPR_PROCESS_START,app_REG_REGION_NOT_LIVE_REGION,app_REG_REGION_NOT_WORK_REGION,app_LIVE_REGION_NOT_WORK_REGION,app_REG_CITY_NOT_LIVE_CITY,app_REG_CITY_NOT_WORK_CITY,app_LIVE_CITY_NOT_WORK_CITY,app_EXT_SOURCE_1,app_EXT_SOURCE_2,app_EXT_SOURCE_3,app_APARTMENTS_AVG,app_BASEMENTAREA_AVG,app_YEARS_BEGINEXPLUATATION_AVG,app_YEARS_BUILD_AVG,app_COMMONAREA_AVG,app_ELEVATORS_AVG,app_ENTRANCES_AVG,app_FLOORSMAX_AVG,app_FLOORSMIN_AVG,app_LANDAREA_AVG,app_LIVINGAPARTMENTS_AVG,app_LIVINGAREA_AVG,app_NONLIVINGAPARTMENTS_AVG,app_NONLIVINGAREA_AVG,app_YEARS_BUILD_MODE,app_OBS_30_CNT_SOCIAL_CIRCLE,app_DEF_30_CNT_SOCIAL_CIRCLE,app_OBS_60_CNT_SOCIAL_CIRCLE,app_DEF_60_CNT_SOCIAL_CIRCLE,app_DAYS_LAST_PHONE_CHANGE,app_FLAG_DOCUMENT_2,app_FLAG_DOCUMENT_3,app_FLAG_DOCUMENT_4,app_FLAG_DOCUMENT_5,app_FLAG_DOCUMENT_6,app_FLAG_DOCUMENT_7,app_FLAG_DOCUMENT_8,app_FLAG_DOCUMENT_9,app_FLAG_DOCUMENT_10,app_FLAG_DOCUMENT_11,app_FLAG_DOCUMENT_12,app_FLAG_DOCUMENT_13,app_FLAG_DOCUMENT_14,app_FLAG_DOCUMENT_15,app_FLAG_DOCUMENT_16,app_FLAG_DOCUMENT_17,app_FLAG_DOCUMENT_18,app_FLAG_DOCUMENT_19,app_FLAG_DOCUMENT_20,app_FLAG_DOCUMENT_21,app_AMT_REQ_CREDIT_BUREAU_HOUR,app_AMT_REQ_CREDIT_BUREAU_DAY,app_AMT_REQ_CREDIT_BUREAU_WEEK,app_AMT_REQ_CREDIT_BUREAU_MON,app_AMT_REQ_CREDIT_BUREAU_QRT,app_AMT_REQ_CREDIT_BUREAU_YEAR,app_CREDIT_BY_INCOME,app_ANNUITY_BY_INCOME,app_GOODS_PRICE_BY_INCOME,app_PERCENT_WORKED,app_CNT_ADULTS,app_NUM_EXT_SOURCES,app_NUM_DOCUMENTS,app_ISNULL_BURO_ENQUIRIES,app_ISNULL_SOCIAL_CIRCLE,app_NAME_CONTRACT_TYPE_Revolving loans,app_CODE_GENDER_M,app_CODE_GENDER_XNA,app_FLAG_OWN_CAR_Y,app_FLAG_OWN_REALTY_Y,app_NAME_TYPE_SUITE_Family,app_NAME_TYPE_SUITE_Group of people,app_NAME_TYPE_SUITE_Other_A,app_NAME_TYPE_SUITE_Other_B,"app_NAME_TYPE_SUITE_Spouse, partner",app_NAME_TYPE_SUITE_Unaccompanied,app_NAME_INCOME_TYPE_Commercial associate,app_NAME_INCOME_TYPE_Maternity leave,app_NAME_INCOME_TYPE_Pensioner,app_NAME_INCOME_TYPE_State servant,app_NAME_INCOME_TYPE_Student,app_NAME_INCOME_TYPE_Unemployed,app_NAME_INCOME_TYPE_Working,app_NAME_EDUCATION_TYPE_Higher education,app_NAME_EDUCATION_TYPE_Incomplete higher,app_NAME_EDUCATION_TYPE_Lower secondary,app_NAME_EDUCATION_TYPE_Secondary / secondary special,app_NAME_FAMILY_STATUS_Married,app_NAME_FAMILY_STATUS_Separated,app_NAME_FAMILY_STATUS_Single / not married,app_NAME_FAMILY_STATUS_Unknown,app_NAME_FAMILY_STATUS_Widow,app_NAME_HOUSING_TYPE_House / apartment,app_NAME_HOUSING_TYPE_Municipal apartment,app_NAME_HOUSING_TYPE_Office apartment,app_NAME_HOUSING_TYPE_Rented apartment,app_NAME_HOUSING_TYPE_With parents,app_OCCUPATION_TYPE_Cleaning staff,app_OCCUPATION_TYPE_Cooking staff,app_OCCUPATION_TYPE_Core staff,app_OCCUPATION_TYPE_Drivers,app_OCCUPATION_TYPE_HR staff,app_OCCUPATION_TYPE_High skill tech staff,app_OCCUPATION_TYPE_IT staff,app_OCCUPATION_TYPE_Laborers,app_OCCUPATION_TYPE_Low-skill Laborers,app_OCCUPATION_TYPE_Managers,app_OCCUPATION_TYPE_Medicine staff,app_OCCUPATION_TYPE_Private service staff,app_OCCUPATION_TYPE_Realty agents,app_OCCUPATION_TYPE_Sales staff,app_OCCUPATION_TYPE_Secretaries,app_OCCUPATION_TYPE_Security staff,app_OCCUPATION_TYPE_Waiters/barmen staff,app_WEEKDAY_APPR_PROCESS_START_MONDAY,app_WEEKDAY_APPR_PROCESS_START_SATURDAY,app_WEEKDAY_APPR_PROCESS_START_SUNDAY,app_WEEKDAY_APPR_PROCESS_START_THURSDAY,app_WEEKDAY_APPR_PROCESS_START_TUESDAY,app_WEEKDAY_APPR_PROCESS_START_WEDNESDAY,app_ORGANIZATION_TYPE_Agriculture,app_ORGANIZATION_TYPE_Bank,app_ORGANIZATION_TYPE_Business Entity Type 1,app_ORGANIZATION_TYPE_Business Entity Type 2,app_ORGANIZATION_TYPE_Business Entity Type 3,app_ORGANIZATION_TYPE_Cleaning,app_ORGANIZATION_TYPE_Construction,app_ORGANIZATION_TYPE_Culture,app_ORGANIZATION_TYPE_Electricity,app_ORGANIZATION_TYPE_Emergency,app_ORGANIZATION_TYPE_Government,app_ORGANIZATION_TYPE_Hotel,app_ORGANIZATION_TYPE_Housing,app_ORGANIZATION_TYPE_Industry: type 1,app_ORGANIZATION_TYPE_Industry: type 10,app_ORGANIZATION_TYPE_Industry: type 11,app_ORGANIZATION_TYPE_Industry: type 12,app_ORGANIZATION_TYPE_Industry: type 13,app_ORGANIZATION_TYPE_Industry: type 2,app_ORGANIZATION_TYPE_Industry: type 3,app_ORGANIZATION_TYPE_Industry: type 4,app_ORGANIZATION_TYPE_Industry: type 5,app_ORGANIZATION_TYPE_Industry: type 6,app_ORGANIZATION_TYPE_Industry: type 7,app_ORGANIZATION_TYPE_Industry: type 8,app_ORGANIZATION_TYPE_Industry: type 9,app_ORGANIZATION_TYPE_Insurance,app_ORGANIZATION_TYPE_Kindergarten,app_ORGANIZATION_TYPE_Legal Services,app_ORGANIZATION_TYPE_Medicine,app_ORGANIZATION_TYPE_Military,app_ORGANIZATION_TYPE_Mobile,app_ORGANIZATION_TYPE_Other,app_ORGANIZATION_TYPE_Police,app_ORGANIZATION_TYPE_Postal,app_ORGANIZATION_TYPE_Realtor,app_ORGANIZATION_TYPE_Religion,app_ORGANIZATION_TYPE_Restaurant,app_ORGANIZATION_TYPE_School,app_ORGANIZATION_TYPE_Security,app_ORGANIZATION_TYPE_Security Ministries,app_ORGANIZATION_TYPE_Self-employed,app_ORGANIZATION_TYPE_Services,app_ORGANIZATION_TYPE_Telecom,app_ORGANIZATION_TYPE_Trade: type 1,app_ORGANIZATION_TYPE_Trade: type 2,app_ORGANIZATION_TYPE_Trade: type 3,app_ORGANIZATION_TYPE_Trade: type 4,app_ORGANIZATION_TYPE_Trade: type 5,app_ORGANIZATION_TYPE_Trade: type 6,app_ORGANIZATION_TYPE_Trade: type 7,app_ORGANIZATION_TYPE_Transport: type 1,app_ORGANIZATION_TYPE_Transport: type 2,app_ORGANIZATION_TYPE_Transport: type 3,app_ORGANIZATION_TYPE_Transport: type 4,app_ORGANIZATION_TYPE_University,app_ORGANIZATION_TYPE_XNA,app_FONDKAPREMONT_MODE_org spec account,app_FONDKAPREMONT_MODE_reg oper account,app_FONDKAPREMONT_MODE_reg oper spec account,app_HOUSETYPE_MODE_specific housing,app_HOUSETYPE_MODE_terraced house,app_WALLSMATERIAL_MODE_Mixed,app_WALLSMATERIAL_MODE_Monolithic,app_WALLSMATERIAL_MODE_Others,app_WALLSMATERIAL_MODE_Panel,"app_WALLSMATERIAL_MODE_Stone, brick",app_WALLSMATERIAL_MODE_Wooden,app_EMERGENCYSTATE_MODE_Yes,app_DAY_APPR_PROCESS_START_Working day
0,100002,0,12.2185,12.915581,10.114619,12.768544,0.018801,315.0,21.0,122.0,71.0,,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.6341,2.0,2.0,2.0,2.0,38.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,2.007889,0.121978,1.733333,0.067329,1.0,3,1,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
1,100003,0,12.506181,14.072865,10.482892,13.937287,0.003541,559.0,40.0,40.0,10.0,,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.804,1.0,0.0,1.0,0.0,28.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,4.79075,0.132217,4.183333,0.070862,2.0,2,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
2,100004,0,11.119898,11.813037,8.817446,11.813037,0.010032,635.0,8.0,142.0,84.0,26.0,1,1,1,1,1,0,1.0,2,2,9,0,0,0,0,0,0,,0.555912,0.729567,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,27.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.1,2.0,0.011813,1.0,2,0,1,1,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,100006,0,11.813037,12.652947,10.298481,12.601491,0.008019,634.0,101.0,328.0,81.0,,1,1,0,1,0,0,2.0,2,2,17,0,0,0,0,0,0,,0.650442,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,21.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,2.316167,0.2199,2.2,0.159905,2.0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,100007,0,11.707678,13.148033,9.992711,13.148033,0.028663,664.0,101.0,144.0,115.0,,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,1,1,,0.322738,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,37.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,4.222222,0.179963,4.222222,0.152418,1.0,1,1,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [21]:
# check test
test.head()

Unnamed: 0,SK_ID_CURR,app_CNT_CHILDREN,app_AMT_INCOME_TOTAL,app_AMT_CREDIT,app_AMT_ANNUITY,app_AMT_GOODS_PRICE,app_REGION_POPULATION_RELATIVE,app_DAYS_BIRTH,app_DAYS_EMPLOYED,app_DAYS_REGISTRATION,app_DAYS_ID_PUBLISH,app_OWN_CAR_AGE,app_FLAG_MOBIL,app_FLAG_EMP_PHONE,app_FLAG_WORK_PHONE,app_FLAG_CONT_MOBILE,app_FLAG_PHONE,app_FLAG_EMAIL,app_CNT_FAM_MEMBERS,app_REGION_RATING_CLIENT,app_REGION_RATING_CLIENT_W_CITY,app_HOUR_APPR_PROCESS_START,app_REG_REGION_NOT_LIVE_REGION,app_REG_REGION_NOT_WORK_REGION,app_LIVE_REGION_NOT_WORK_REGION,app_REG_CITY_NOT_LIVE_CITY,app_REG_CITY_NOT_WORK_CITY,app_LIVE_CITY_NOT_WORK_CITY,app_EXT_SOURCE_1,app_EXT_SOURCE_2,app_EXT_SOURCE_3,app_APARTMENTS_AVG,app_BASEMENTAREA_AVG,app_YEARS_BEGINEXPLUATATION_AVG,app_YEARS_BUILD_AVG,app_COMMONAREA_AVG,app_ELEVATORS_AVG,app_ENTRANCES_AVG,app_FLOORSMAX_AVG,app_FLOORSMIN_AVG,app_LANDAREA_AVG,app_LIVINGAPARTMENTS_AVG,app_LIVINGAREA_AVG,app_NONLIVINGAPARTMENTS_AVG,app_NONLIVINGAREA_AVG,app_YEARS_BUILD_MODE,app_OBS_30_CNT_SOCIAL_CIRCLE,app_DEF_30_CNT_SOCIAL_CIRCLE,app_OBS_60_CNT_SOCIAL_CIRCLE,app_DEF_60_CNT_SOCIAL_CIRCLE,app_DAYS_LAST_PHONE_CHANGE,app_FLAG_DOCUMENT_2,app_FLAG_DOCUMENT_3,app_FLAG_DOCUMENT_4,app_FLAG_DOCUMENT_5,app_FLAG_DOCUMENT_6,app_FLAG_DOCUMENT_7,app_FLAG_DOCUMENT_8,app_FLAG_DOCUMENT_9,app_FLAG_DOCUMENT_10,app_FLAG_DOCUMENT_11,app_FLAG_DOCUMENT_12,app_FLAG_DOCUMENT_13,app_FLAG_DOCUMENT_14,app_FLAG_DOCUMENT_15,app_FLAG_DOCUMENT_16,app_FLAG_DOCUMENT_17,app_FLAG_DOCUMENT_18,app_FLAG_DOCUMENT_19,app_FLAG_DOCUMENT_20,app_FLAG_DOCUMENT_21,app_AMT_REQ_CREDIT_BUREAU_HOUR,app_AMT_REQ_CREDIT_BUREAU_DAY,app_AMT_REQ_CREDIT_BUREAU_WEEK,app_AMT_REQ_CREDIT_BUREAU_MON,app_AMT_REQ_CREDIT_BUREAU_QRT,app_AMT_REQ_CREDIT_BUREAU_YEAR,app_CREDIT_BY_INCOME,app_ANNUITY_BY_INCOME,app_GOODS_PRICE_BY_INCOME,app_PERCENT_WORKED,app_CNT_ADULTS,app_NUM_EXT_SOURCES,app_NUM_DOCUMENTS,app_ISNULL_BURO_ENQUIRIES,app_ISNULL_SOCIAL_CIRCLE,app_NAME_CONTRACT_TYPE_Revolving loans,app_CODE_GENDER_M,app_CODE_GENDER_XNA,app_FLAG_OWN_CAR_Y,app_FLAG_OWN_REALTY_Y,app_NAME_TYPE_SUITE_Family,app_NAME_TYPE_SUITE_Group of people,app_NAME_TYPE_SUITE_Other_A,app_NAME_TYPE_SUITE_Other_B,"app_NAME_TYPE_SUITE_Spouse, partner",app_NAME_TYPE_SUITE_Unaccompanied,app_NAME_INCOME_TYPE_Commercial associate,app_NAME_INCOME_TYPE_Maternity leave,app_NAME_INCOME_TYPE_Pensioner,app_NAME_INCOME_TYPE_State servant,app_NAME_INCOME_TYPE_Student,app_NAME_INCOME_TYPE_Unemployed,app_NAME_INCOME_TYPE_Working,app_NAME_EDUCATION_TYPE_Higher education,app_NAME_EDUCATION_TYPE_Incomplete higher,app_NAME_EDUCATION_TYPE_Lower secondary,app_NAME_EDUCATION_TYPE_Secondary / secondary special,app_NAME_FAMILY_STATUS_Married,app_NAME_FAMILY_STATUS_Separated,app_NAME_FAMILY_STATUS_Single / not married,app_NAME_FAMILY_STATUS_Unknown,app_NAME_FAMILY_STATUS_Widow,app_NAME_HOUSING_TYPE_House / apartment,app_NAME_HOUSING_TYPE_Municipal apartment,app_NAME_HOUSING_TYPE_Office apartment,app_NAME_HOUSING_TYPE_Rented apartment,app_NAME_HOUSING_TYPE_With parents,app_OCCUPATION_TYPE_Cleaning staff,app_OCCUPATION_TYPE_Cooking staff,app_OCCUPATION_TYPE_Core staff,app_OCCUPATION_TYPE_Drivers,app_OCCUPATION_TYPE_HR staff,app_OCCUPATION_TYPE_High skill tech staff,app_OCCUPATION_TYPE_IT staff,app_OCCUPATION_TYPE_Laborers,app_OCCUPATION_TYPE_Low-skill Laborers,app_OCCUPATION_TYPE_Managers,app_OCCUPATION_TYPE_Medicine staff,app_OCCUPATION_TYPE_Private service staff,app_OCCUPATION_TYPE_Realty agents,app_OCCUPATION_TYPE_Sales staff,app_OCCUPATION_TYPE_Secretaries,app_OCCUPATION_TYPE_Security staff,app_OCCUPATION_TYPE_Waiters/barmen staff,app_WEEKDAY_APPR_PROCESS_START_MONDAY,app_WEEKDAY_APPR_PROCESS_START_SATURDAY,app_WEEKDAY_APPR_PROCESS_START_SUNDAY,app_WEEKDAY_APPR_PROCESS_START_THURSDAY,app_WEEKDAY_APPR_PROCESS_START_TUESDAY,app_WEEKDAY_APPR_PROCESS_START_WEDNESDAY,app_ORGANIZATION_TYPE_Agriculture,app_ORGANIZATION_TYPE_Bank,app_ORGANIZATION_TYPE_Business Entity Type 1,app_ORGANIZATION_TYPE_Business Entity Type 2,app_ORGANIZATION_TYPE_Business Entity Type 3,app_ORGANIZATION_TYPE_Cleaning,app_ORGANIZATION_TYPE_Construction,app_ORGANIZATION_TYPE_Culture,app_ORGANIZATION_TYPE_Electricity,app_ORGANIZATION_TYPE_Emergency,app_ORGANIZATION_TYPE_Government,app_ORGANIZATION_TYPE_Hotel,app_ORGANIZATION_TYPE_Housing,app_ORGANIZATION_TYPE_Industry: type 1,app_ORGANIZATION_TYPE_Industry: type 10,app_ORGANIZATION_TYPE_Industry: type 11,app_ORGANIZATION_TYPE_Industry: type 12,app_ORGANIZATION_TYPE_Industry: type 13,app_ORGANIZATION_TYPE_Industry: type 2,app_ORGANIZATION_TYPE_Industry: type 3,app_ORGANIZATION_TYPE_Industry: type 4,app_ORGANIZATION_TYPE_Industry: type 5,app_ORGANIZATION_TYPE_Industry: type 6,app_ORGANIZATION_TYPE_Industry: type 7,app_ORGANIZATION_TYPE_Industry: type 8,app_ORGANIZATION_TYPE_Industry: type 9,app_ORGANIZATION_TYPE_Insurance,app_ORGANIZATION_TYPE_Kindergarten,app_ORGANIZATION_TYPE_Legal Services,app_ORGANIZATION_TYPE_Medicine,app_ORGANIZATION_TYPE_Military,app_ORGANIZATION_TYPE_Mobile,app_ORGANIZATION_TYPE_Other,app_ORGANIZATION_TYPE_Police,app_ORGANIZATION_TYPE_Postal,app_ORGANIZATION_TYPE_Realtor,app_ORGANIZATION_TYPE_Religion,app_ORGANIZATION_TYPE_Restaurant,app_ORGANIZATION_TYPE_School,app_ORGANIZATION_TYPE_Security,app_ORGANIZATION_TYPE_Security Ministries,app_ORGANIZATION_TYPE_Self-employed,app_ORGANIZATION_TYPE_Services,app_ORGANIZATION_TYPE_Telecom,app_ORGANIZATION_TYPE_Trade: type 1,app_ORGANIZATION_TYPE_Trade: type 2,app_ORGANIZATION_TYPE_Trade: type 3,app_ORGANIZATION_TYPE_Trade: type 4,app_ORGANIZATION_TYPE_Trade: type 5,app_ORGANIZATION_TYPE_Trade: type 6,app_ORGANIZATION_TYPE_Trade: type 7,app_ORGANIZATION_TYPE_Transport: type 1,app_ORGANIZATION_TYPE_Transport: type 2,app_ORGANIZATION_TYPE_Transport: type 3,app_ORGANIZATION_TYPE_Transport: type 4,app_ORGANIZATION_TYPE_University,app_ORGANIZATION_TYPE_XNA,app_FONDKAPREMONT_MODE_org spec account,app_FONDKAPREMONT_MODE_reg oper account,app_FONDKAPREMONT_MODE_reg oper spec account,app_HOUSETYPE_MODE_specific housing,app_HOUSETYPE_MODE_terraced house,app_WALLSMATERIAL_MODE_Mixed,app_WALLSMATERIAL_MODE_Monolithic,app_WALLSMATERIAL_MODE_Others,app_WALLSMATERIAL_MODE_Panel,"app_WALLSMATERIAL_MODE_Stone, brick",app_WALLSMATERIAL_MODE_Wooden,app_EMERGENCYSTATE_MODE_Yes,app_DAY_APPR_PROCESS_START_Working day
0,100001,0,11.813037,13.251286,9.931176,13.017005,0.01885,641.0,78.0,172.0,27.0,,1,1,0,1,0,1,2.0,2,2,18,0,0,0,0,0,0,0.752614,0.789654,0.15952,0.066,0.059,0.9732,,,,0.1379,0.125,,,,0.0505,,,,0.0,0.0,0.0,0.0,58.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,4.213333,0.1523,3.333333,0.121044,2.0,3,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,100005,0,11.502885,12.313891,9.762557,12.100718,0.035792,602.0,149.0,304.0,54.0,,1,1,0,1,0,0,2.0,2,2,9,0,0,0,0,0,0,0.56499,0.291656,0.432962,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,2.250182,0.175455,1.818182,0.247398,2.0,3,1,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,100013,0,12.2185,13.40493,11.153074,13.353477,0.019101,668.0,149.0,72.0,117.0,5.0,1,1,0,1,0,0,2.0,2,2,14,0,0,0,0,0,0,,0.699787,0.610991,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,29.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,3.275378,0.344578,3.111111,0.222477,2.0,2,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,100028,2,12.660331,14.269766,10.799973,14.269766,0.026392,466.0,62.0,67.0,140.0,,1,1,0,1,1,0,4.0,2,2,11,0,0,0,0,0,0,0.525734,0.509677,0.612704,0.3052,0.1974,0.997,0.9592,0.1165,0.32,0.2759,0.375,0.0417,0.2042,0.2404,0.3673,0.0386,0.08,0.9608,0.0,0.0,0.0,0.0,60.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,0.155614,5.0,0.133515,2.0,3,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
4,100038,1,12.100718,13.346308,10.375614,13.346308,0.010032,435.0,73.0,133.0,142.0,16.0,1,1,1,1,0,0,3.0,2,2,5,0,0,0,0,1,1,0.202145,0.425687,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,27.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,3.475,0.17815,3.475,0.168021,2.0,2,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [22]:
# extract target
y = y["TARGET"]

In [29]:
# exclude features
excluded_feats = ["SK_ID_CURR", "app_EXT_SOURCE_1", "app_EXT_SOURCE_2", "app_EXT_SOURCE_3"]
features = [f for f in train.columns if f not in excluded_feats]

In [30]:
# check dimensions
print(train[features].shape)
print(test[features].shape)

(307511, 207)
(48744, 207)


In [31]:
### PARAMETERS

# lightGBM
gbm = lgb.LGBMClassifier(n_estimators     = 5000,
                         learning_rate    = 0.005,
                         num_leaves       = 70,
                         colsample_bytree = 0.8,
                         subsample        = 0.9,
                         max_depth        = 7,
                         reg_alpha        = 0.1,
                         reg_lambda       = 0.1,
                         min_split_gain   = 0.01,
                         min_child_weight = 2,
                         random_state     = seed)

# learner settings
metric   = "auc"
verbose  = 500
stopping = 200

# CV settings
num_folds = 5
shuffle   = True

# 3. CROSS-VALIDATION

In [32]:
# data partitinoing
folds = StratifiedKFold(n_splits = num_folds, random_state = seed, shuffle = shuffle)

# placeholders
valid_aucs_cv = np.zeros(num_folds) 
test_preds_cv = np.zeros(test.shape[0])
feature_importance_df = pd.DataFrame()

In [34]:
### CROSS-VALIDATION LOOP
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    
    # data partitioning
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]
    
    # train lightGBM
    gbm = gbm.fit(trn_x, trn_y, 
                  eval_set = [(trn_x, trn_y), (val_x, val_y)], 
                  eval_metric = metric, 
                  verbose = verbose, 
                  early_stopping_rounds = stopping)
    
    # save number of iterations
    num_iter_cv = gbm.best_iteration_
    
    # predictions
    valid_preds_cv = gbm.predict_proba(val_x, num_iteration = num_iter_cv)[:, 1]
    valid_aucs_cv[n_fold] = roc_auc_score(val_y, valid_preds_cv)
    test_preds_cv += gbm.predict_proba(test[features], num_iteration = num_iter_cv)[:, 1] / folds.n_splits
    
    # variable importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = gbm.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis = 0)
    
    # print performance
    print("----------------------")
    print("Fold%2d AUC: %.6f" % (n_fold + 1, valid_aucs_cv[n_fold]))
    print("----------------------")
    print("")

    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()
    
# print overall performance    
print("Cross-Validation AUC score %.6f" % np.mean(valid_aucs_cv))

# APP ONLY
# raw label: 0.761581
# prp label: 0.762234
# prp dummy: 0.762682
# no merger: 0.762768

# APP + PREV
# prp dummy: 0.774839
# rjct rtio: 0.775147
# day round: 0.775385
# 3 pratios: 0.775492
# med & sum: 0.775304
# 3mix+arat: 0.775527

# APP + PREV + BURO
# first prp: 0.783162
# 7000 iter: 0.783302

# APP + PREV + BURO + BBAL
# bbal3vars: 0.783404
# del es123: 0.783404

Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.713709	valid_1's auc: 0.681583
[1000]	training's auc: 0.748365	valid_1's auc: 0.696951
[1500]	training's auc: 0.774651	valid_1's auc: 0.702946
[2000]	training's auc: 0.794492	valid_1's auc: 0.705457
[2500]	training's auc: 0.811241	valid_1's auc: 0.706965
[3000]	training's auc: 0.825303	valid_1's auc: 0.707634
[3500]	training's auc: 0.837611	valid_1's auc: 0.707927
[4000]	training's auc: 0.849117	valid_1's auc: 0.708131
Early stopping, best iteration is:
[3950]	training's auc: 0.848081	valid_1's auc: 0.708174
----------------------
Fold 1 AUC: 0.708174
----------------------

Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.713777	valid_1's auc: 0.683673
[1000]	training's auc: 0.748405	valid_1's auc: 0.698689
[1500]	training's auc: 0.774656	valid_1's auc: 0.704393
[2000]	training's auc: 0.794822	valid_1's auc: 0.707107
[2500]	training's auc: 0.812437	valid_1's auc

In [None]:
# plot variable importance
top_k = 100
cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by = "importance", ascending = False)[:top_k].index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize = (10, 18))
sns.barplot(x = "importance", y = "feature", data = best_features.sort_values(by = "importance", ascending = False))
plt.title('LightGBM Feature Importance (mean over folds)')
plt.tight_layout()
plt.savefig('../files/lgb_varimp_100_app_prev_buro_bbal.png')

In [29]:
# check full importance ranking
cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by = "importance", ascending = False)
cols["rank"] = range(1, len(cols) + 1)
with pd.option_context('display.max_rows', None):
    print(cols)

                                 importance  rank
feature                                          
app_EXT_SOURCE_1                    14623.4     1
app_EXT_SOURCE_3                    14170.6     2
app_EXT_SOURCE_2                    12608.8     3
app_DAYS_BIRTH                      10693.6     4
app_DAYS_ID_PUBLISH                  8853.2     5
app_AMT_ANNUITY                      8747.0     6
app_PERCENT_WORKED                   8669.0     7
app_DAYS_EMPLOYED                    8415.4     8
app_DAYS_REGISTRATION                8383.0     9
app_AMT_CREDIT                       8284.6    10
app_AMT_GOODS_PRICE                  7435.6    11
app_ANNUITY_BY_INCOME                7349.4    12
app_DAYS_LAST_PHONE_CHANGE           6803.4    13
app_CREDIT_BY_INCOME                 5855.6    14
app_GOODS_PRICE_BY_INCOME            5722.6    15
app_REGION_POPULATION_RELATIVE       5633.2    16
app_OWN_CAR_AGE                      5591.8    17
app_LANDAREA_AVG                     4730.0    18


# 3. SUBMISSION

In [37]:
# create submission
test["TARGET"] = test_preds_cv
subm = test[["SK_ID_CURR", "TARGET"]]

# export CSV
subm.to_csv("../submissions/lgb_bag_5cv_app_noext_0706446.csv", index = False, float_format = "%.8f")