## Installments

In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Master switch for disk output: when True, the installment feature tables are
# written to CSV (the test set is only loaded and merged in that case).
save_files = True

In [None]:
# Read the raw installment-payment records and show a quick overview:
# schema/memory summary, the first rows, and per-column summary statistics.
df_inst = pd.read_csv('input/installments_payments.csv')
df_inst.info()
for overview in (df_inst.head(), df_inst.describe()):
    display(overview)

In [None]:
# Shorten the verbose raw column names; the derived columns below are then
# written in terms of the short names.
short_names = {
    'NUM_INSTALMENT_VERSION': 'VERSION',
    'NUM_INSTALMENT_NUMBER': 'NUM',
    'DAYS_INSTALMENT': 'DAY_DUE',
    'DAYS_ENTRY_PAYMENT': 'DAY_PAY',
    'AMT_INSTALMENT': 'AMT',
    'AMT_PAYMENT': 'PAID',
}
df_inst.rename(columns=short_names, inplace=True)

# Per-row differences, appended as the last two columns:
#   DAY_DIFF = DAY_DUE - DAY_PAY   (instalment day minus payment-entry day)
#   PAY_DIFF = AMT - PAID          (instalment amount minus payment amount)
df_inst['DAY_DIFF'] = df_inst['DAY_DUE'].sub(df_inst['DAY_PAY'])
df_inst['PAY_DIFF'] = df_inst['AMT'].sub(df_inst['PAID'])

In [None]:
# Lookup from previous-loan id to current-application id: the first
# SK_ID_CURR found for each SK_ID_PREV. The column is removed from the
# row-level table once the lookup has been captured.
id_prev_to_curr = df_inst.groupby('SK_ID_PREV')['SK_ID_CURR'].first()

del df_inst['SK_ID_CURR']

In [None]:
# Order rows by previous loan and then by due day, with a fresh 0..n-1 index.
# The row order matters: the later 'first' aggregations pick the first row
# of each SK_ID_PREV group in this order.
sort_keys = ['SK_ID_PREV', 'DAY_DUE']
df_inst = df_inst.sort_values(by=sort_keys).reset_index(drop=True)

df_inst.head()

In [None]:
# Statistics computed per previous loan (SK_ID_PREV). The day columns get
# min/max/first; the amount and difference columns additionally get the mean.
day_stats = ['min', 'max', 'first']
full_stats = day_stats + ['mean']

agg_funcs = {
    'VERSION': ['nunique'],
    'NUM': ['first'],
    'DAY_DUE': list(day_stats),
    'DAY_PAY': list(day_stats),
    'AMT': list(full_stats),
    'PAID': list(full_stats),
    'DAY_DIFF': list(full_stats),
    'PAY_DIFF': list(full_stats),
}

inst_grouped = df_inst.groupby('SK_ID_PREV').agg(agg_funcs)

# Flatten the (column, statistic) pairs into names like 'INST_AMT_mean'.
inst_grouped.columns = [
    f'INST_{column}_{stat}' for column, stat in inst_grouped.columns
]

# The row-level table is no longer needed; release it.
del df_inst

inst_grouped.head()

In [None]:
# Attach the current-application id to each previous-loan row; both sides are
# indexed by SK_ID_PREV, so the assignment aligns on that index.
inst_grouped['SK_ID_CURR'] = id_prev_to_curr

# Second-level aggregation: summarise every per-loan feature per applicant.
applicant_stats = ['min', 'max', 'sum', 'mean']
inst_idcurr = inst_grouped.groupby('SK_ID_CURR').agg(applicant_stats)

# Number of previous loans (SK_ID_PREV entries) recorded for each applicant.
prev_loan_counts = id_prev_to_curr.value_counts()
inst_idcurr['INST_CNT'] = prev_loan_counts

# Flatten the two-level column labels into single strings.
# NOTE(review): 'INST_CNT' is added while the columns are still two-level, so
# its label is ('INST_CNT', '') and the flattened name becomes 'INST_CNT_'
# (trailing underscore). Kept as-is because the name ends up in the saved
# CSV files; confirm with downstream consumers before renaming.
inst_idcurr.columns = ['_'.join(parts) for parts in inst_idcurr.columns.values]
inst_cols = inst_idcurr.columns

# The per-loan table is no longer needed; release it.
del inst_grouped

inst_idcurr.head()

In [None]:
# Both application tables are left-joined to the installment features on
# their SK_ID_CURR index, so applicants without installment history keep
# their row and get NaN in the INST_ columns.
merge_opts = dict(how='left', left_index=True, right_index=True)

print('loading train...')
df_train = pd.read_csv('input/application_train.csv', index_col='SK_ID_CURR')

print('merging train...')
df_train = df_train.merge(inst_idcurr, **merge_opts)

# The test set is only loaded and merged when files are being written.
if save_files:
    print('saving train...')
    train_features = df_train[inst_cols]
    train_features.to_csv('input/app_train_ALL_inst.csv')

    print('loading test...')
    df_test = pd.read_csv('input/application_test.csv', index_col='SK_ID_CURR')

    print('merging test...')
    df_test = df_test.merge(inst_idcurr, **merge_opts)

    print('saving test...')
    test_features = df_test[inst_cols]
    test_features.to_csv('input/app_test_ALL_inst.csv')

# The per-applicant feature table has been merged; release it.
del inst_idcurr

display(df_train.head())

print('Done!')

## Model

In [None]:
def plot_roc_curve(y_true, y_pred, color='darkorange',plot_random=True,label='ROC curve'):
    """Draw a ROC curve on the current matplotlib figure and show it.

    Parameters
    ----------
    y_true : array-like
        True labels, passed to ``roc_curve`` and ``roc_auc_score``.
    y_pred : array-like
        Predicted scores, passed to ``roc_curve`` and ``roc_auc_score``.
    color : str
        Line colour of the ROC curve.
    plot_random : bool
        When True, also draw the dashed diagonal from (0, 0) to (1, 1).
    label : str
        Legend text; the AUC is appended as `` (area = 0.xx)``.

    Returns
    -------
    None
        The plot is displayed via ``plt.show()``.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    score = roc_auc_score(y_true, y_pred)

    # Use the caller-supplied colour (previously the argument was ignored and
    # the curve was always drawn in 'darkorange').
    plt.plot(fpr, tpr, color=color,
             lw=2, label=label + ' (area = %0.2f)' % score)

    if plot_random:
        # Reference diagonal drawn between (0, 0) and (1, 1).
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    plt.xlim([0.0, 1.0])
    # Upper y limit is slightly above 1.0.
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Per-row sample weights: rows with TARGET == 0 get weight 1 and rows with
# TARGET == 1 get count(TARGET == 0) / count(TARGET == 1).
target_counts = df_train['TARGET'].value_counts()
targ1_weight = target_counts[0] / target_counts[1]
weights = df_train['TARGET'].replace({0: 1, 1: targ1_weight}).values

# Convert every object-dtype column to pandas 'category' dtype before the
# frame is handed to LightGBM.
dtypes = df_train.dtypes
cols_categ = [name for name, dtype in dtypes.items() if dtype == 'object']
df_train = df_train.astype({name: 'category' for name in cols_categ})

# Split into label and feature matrix and wrap them for LightGBM.
y_true = df_train['TARGET']
X = df_train.drop(columns='TARGET')
train_data = lgb.Dataset(X, label=y_true, weight=weights)

# Train a binary classifier, with AUC as the metric.
param = {'objective': 'binary', 'metric': 'auc'}
bst = lgb.train(param, train_data)

# ROC curve of the fitted model.
# NOTE(review): predictions are made on the same X the model was trained on,
# so this is a training-set ROC, not a held-out estimate.
plt.figure()
train_scores = bst.predict(X)
plot_roc_curve(y_true, train_scores)

In [None]:
# Per-feature importance from the trained booster, by split count and by gain.
feat_imp = pd.DataFrame({'split':bst.feature_importance(importance_type='split'),
                         'gain':bst.feature_importance(importance_type='gain')},
                        index=bst.feature_name())

# Each feature's share of the total gain, largest first.
total_gain = feat_imp['gain'].sum()
frac_gain = (feat_imp['gain'] / total_gain).sort_values(ascending=False)

display(frac_gain.head(30))

# A single figure with two panels. (A stray, never-drawn-on
# plt.figure(figsize=(10,30)) used to be created before this one and only
# produced an empty figure; it has been removed.)
feature_rank = np.arange(len(frac_gain)) + 1

plt.figure(figsize=(15,5))

# Left panel: gain fraction of each feature, in descending order.
plt.subplot(1,2,1)
plt.plot(feature_rank, frac_gain)
plt.xlabel('n_features')
plt.ylabel('frac gain')

# Right panel: cumulative gain fraction covered by the top-n features.
plt.subplot(1,2,2)
plt.plot(feature_rank, frac_gain.cumsum())
plt.xlabel('n_features')
plt.ylabel('cumulative frac gain')

In [None]:
# Select the top n installment features by gain fraction. frac_gain is sorted
# in descending order, so the first n_cols matches are the most important.
n_cols = 60

# Installment features are the ones whose name starts with 'INST_'. A prefix
# test is used instead of a regex substring search so that a feature that
# merely contains 'INST_' somewhere in its name is not selected.
is_inst_feature = frac_gain.index.str.startswith('INST_')
select_cols = frac_gain[is_inst_feature].index[:n_cols]

# df_test only exists when save_files is True, which is also the only case
# in which it is used here.
if save_files:
    df_train[select_cols].to_csv('input/app_train_SELECT_inst.csv')
    df_test[select_cols].to_csv('input/app_test_SELECT_inst.csv')

frac_gain[select_cols]