## Installments

In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Output toggle read by the final cell: when True, the installment feature columns
# are written to CSV for the train set, and the test set is loaded, merged and
# written as well. When False, only the train merge is performed in memory.
save_files = True

In [3]:
# Read the raw installment payment history and take a first look at it:
# column dtypes / memory footprint, the first few rows, and summary statistics.
installments_csv = 'input/installments_payments.csv'
df_inst = pd.read_csv(installments_csv)

df_inst.info()
display(df_inst.head(), df_inst.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 8 columns):
SK_ID_PREV                int64
SK_ID_CURR                int64
NUM_INSTALMENT_VERSION    float64
NUM_INSTALMENT_NUMBER     int64
DAYS_INSTALMENT           float64
DAYS_ENTRY_PAYMENT        float64
AMT_INSTALMENT            float64
AMT_PAYMENT               float64
dtypes: float64(5), int64(3)
memory usage: 830.4 MB


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [6]:
# Derived per-installment features, computed from the raw columns before they
# are renamed below.
#
# DAYS_DIFF: due day minus payment day. Positive when the payment day value is
#            smaller than the due day value (i.e. paid ahead of schedule,
#            assuming both columns are day offsets on the same scale).
# PAY_DIFF:  billed amount minus paid amount. Positive when less was paid than
#            was billed.
#
# NOTE: the column must be called 'DAYS_DIFF' (not 'DAY_DIFF'): the aggregation
# spec further down keys on 'DAYS_DIFF', so any other name makes the
# groupby().agg() call fail with a KeyError on a fresh Restart & Run All.
df_inst['DAYS_DIFF'] = df_inst['DAYS_INSTALMENT'] - df_inst['DAYS_ENTRY_PAYMENT']
df_inst['PAY_DIFF'] = df_inst['AMT_INSTALMENT'] - df_inst['AMT_PAYMENT']

# Shorten the raw column names; the short names are what every later cell uses
# and what ends up embedded in the generated feature names.
df_inst.rename(columns={'NUM_INSTALMENT_VERSION': 'VERSION',
                        'NUM_INSTALMENT_NUMBER': 'NUM',
                        'DAYS_INSTALMENT': 'DAY_DUE',
                        'DAYS_ENTRY_PAYMENT': 'DAY_PAY',
                        'AMT_INSTALMENT': 'AMT',
                        'AMT_PAYMENT': 'PAID'},
               inplace=True)

In [7]:
# Lookup Series indexed by SK_ID_PREV holding the first SK_ID_CURR recorded for
# that previous loan (presumably every row of a given SK_ID_PREV carries the
# same SK_ID_CURR -- not verified here). It is used later to roll the
# per-loan aggregates up to the applicant level.
id_prev_to_curr = df_inst.groupby('SK_ID_PREV')['SK_ID_CURR'].first()

# The applicant id is no longer needed on the row-level frame once the lookup exists.
df_inst.drop(columns=['SK_ID_CURR'], inplace=True)

In [14]:
# Order rows by previous loan and then by due day, and renumber the index 0..n-1.
# The row order matters: the 'first' aggregations applied per SK_ID_PREV later on
# pick whichever row comes first within each group, i.e. the smallest DAY_DUE.
df_inst = (
    df_inst
    .sort_values(by=['SK_ID_PREV', 'DAY_DUE'])
    .reset_index(drop=True)
)

df_inst.head()

Unnamed: 0,index,SK_ID_PREV,VERSION,NUM,DAY_DUE,DAY_PAY,AMT,PAID,DAYS_DIFF,PAY_DIFF
0,512588,1000001,1.0,1,-268.0,-294.0,6404.31,6404.31,26.0,0.0
1,2159480,1000001,2.0,2,-238.0,-244.0,62039.115,62039.115,6.0,0.0
2,1214732,1000002,1.0,1,-1600.0,-1611.0,6264.0,6264.0,11.0,0.0
3,1631862,1000002,1.0,2,-1570.0,-1575.0,6264.0,6264.0,5.0,0.0
4,3411021,1000002,1.0,3,-1540.0,-1559.0,6264.0,6264.0,19.0,0.0


In [17]:
# Aggregation spec for collapsing installment rows to one row per SK_ID_PREV.
# Day columns get min / max / first; amount and difference columns additionally
# get the mean. Key insertion order determines the output column order.
day_stats = ['min', 'max', 'first']
value_stats = ['min', 'max', 'first', 'mean']

agg_funcs = {'VERSION': ['nunique'], 'NUM': ['first']}
for day_col in ('DAY_DUE', 'DAY_PAY'):
    agg_funcs[day_col] = list(day_stats)
for value_col in ('AMT', 'PAID', 'DAYS_DIFF', 'PAY_DIFF'):
    agg_funcs[value_col] = list(value_stats)

inst_grouped = df_inst.groupby('SK_ID_PREV').agg(agg_funcs)

# Flatten the (column, statistic) pairs into single names: INST_<column>_<statistic>.
inst_grouped.columns = [
    'INST_{}_{}'.format(source_col, stat)
    for source_col, stat in inst_grouped.columns.tolist()
]

# The row-level frame is large and not used again; release it.
del df_inst

inst_grouped.head()

Unnamed: 0_level_0,INST_VERSION_nunique,INST_NUM_first,INST_DAY_DUE_min,INST_DAY_DUE_max,INST_DAY_DUE_first,INST_DAY_PAY_min,INST_DAY_PAY_max,INST_DAY_PAY_first,INST_AMT_min,INST_AMT_max,...,INST_PAID_first,INST_PAID_mean,INST_DAYS_DIFF_min,INST_DAYS_DIFF_max,INST_DAYS_DIFF_first,INST_DAYS_DIFF_mean,INST_PAY_DIFF_min,INST_PAY_DIFF_max,INST_PAY_DIFF_first,INST_PAY_DIFF_mean
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000001,2,1,-268.0,-238.0,-268.0,-294.0,-244.0,-294.0,6404.31,62039.115,...,6404.31,34221.7125,6.0,26.0,26.0,16.0,0.0,0.0,0.0,0.0
1000002,2,1,-1600.0,-1510.0,-1600.0,-1611.0,-1554.0,-1611.0,6264.0,18443.565,...,6264.0,9308.89125,5.0,44.0,11.0,19.75,0.0,0.0,0.0,0.0
1000003,1,1,-94.0,-34.0,-94.0,-108.0,-49.0,-108.0,4951.35,4951.35,...,4951.35,4951.35,14.0,17.0,14.0,15.333333,0.0,0.0,0.0,0.0
1000004,2,1,-862.0,-682.0,-862.0,-881.0,-695.0,-881.0,3391.11,13176.495,...,3391.11,4789.022143,10.0,58.0,19.0,26.714286,0.0,0.0,0.0,0.0
1000005,1,1,-1688.0,-1418.0,-1688.0,-1687.0,-1433.0,-1687.0,14599.26,14713.605,...,14713.605,13365.609545,-3.0,36.0,-1.0,8.454545,0.0,14710.815,0.0,1337.600455


In [26]:
# Roll the per-loan aggregates up to one row per applicant (SK_ID_CURR).
# The applicant id is attached by index alignment on SK_ID_PREV, then every
# per-loan feature is summarised with min / max / sum / mean.
applicant_stats = ['min', 'max', 'sum', 'mean']
inst_idcurr = (
    inst_grouped
    .assign(SK_ID_CURR=id_prev_to_curr)
    .groupby('SK_ID_CURR')
    .agg(applicant_stats)
)

# Number of distinct SK_ID_PREV entries per applicant (id_prev_to_curr has one
# entry per SK_ID_PREV). The column is added while the columns are still
# two-level, so its key is ('INST_CNT', '') and it flattens to 'INST_CNT_'
# below -- the trailing underscore is kept because that is the name written
# to the saved feature files.
inst_idcurr['INST_CNT'] = id_prev_to_curr.value_counts()

# Flatten the two-level column labels into '<feature>_<statistic>' names.
inst_idcurr.columns = ['_'.join(levels) for levels in inst_idcurr.columns.tolist()]
inst_cols = inst_idcurr.columns

del inst_grouped

inst_idcurr.head()

Unnamed: 0_level_0,INST_VERSION_nunique_min,INST_VERSION_nunique_max,INST_VERSION_nunique_sum,INST_VERSION_nunique_mean,INST_NUM_first_min,INST_NUM_first_max,INST_NUM_first_sum,INST_NUM_first_mean,INST_DAY_DUE_min_min,INST_DAY_DUE_min_max,...,INST_PAY_DIFF_max_mean,INST_PAY_DIFF_first_min,INST_PAY_DIFF_first_max,INST_PAY_DIFF_first_sum,INST_PAY_DIFF_first_mean,INST_PAY_DIFF_mean_min,INST_PAY_DIFF_mean_max,INST_PAY_DIFF_mean_sum,INST_PAY_DIFF_mean_mean,INST_CNT_
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1,2,3,1.5,1,2,3,1.5,-2916.0,-1709.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
100002,2,2,2,2.0,1,1,1,1.0,-565.0,-565.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
100003,1,2,4,1.333333,1,1,3,1.0,-2310.0,-716.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
100004,2,2,2,2.0,1,1,1,1.0,-784.0,-784.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
100005,2,2,2,2.0,1,1,1,1.0,-706.0,-706.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [27]:
# Attach the applicant-level installment features to the application tables.
# Both sides are indexed by SK_ID_CURR and the join is a left join, so every
# application row is kept; applicants without installment history get NaN.
print('loading train...')
df_train = pd.read_csv('input/application_train.csv', index_col='SK_ID_CURR')

print('merging train...')
df_train = df_train.join(inst_idcurr, how='left')

# The test set is only needed to write its feature file, so it is loaded
# inside the save branch. Only the installment feature columns are written.
if save_files:
    print('saving train...')
    train_features = df_train[inst_cols]
    train_features.to_csv('input/app_train_ALL_inst.csv')

    print('loading test...')
    df_test = pd.read_csv('input/application_test.csv', index_col='SK_ID_CURR')

    print('merging test...')
    df_test = df_test.join(inst_idcurr, how='left')

    print('saving test...')
    test_features = df_test[inst_cols]
    test_features.to_csv('input/app_test_ALL_inst.csv')

del inst_idcurr

display(df_train.head())

print('Done!')

loading train...
merging train...
saving train...
loading test...
merging test...
saving test...


Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,INST_PAY_DIFF_max_mean,INST_PAY_DIFF_first_min,INST_PAY_DIFF_first_max,INST_PAY_DIFF_first_sum,INST_PAY_DIFF_first_mean,INST_PAY_DIFF_mean_min,INST_PAY_DIFF_mean_max,INST_PAY_DIFF_mean_sum,INST_PAY_DIFF_mean_mean,INST_CNT_
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,5250.789,0.0,23.13,23.13,4.626,0.0,1744.521923,2257.277637,451.455527,5.0


Done!
