## Credit Card Balance

In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Master switch for writing results to disk. When True, the merged credit card
# features are saved as CSV files and the test application table is loaded and
# merged as well; when False, only the train table is processed in memory.
save_files = True

In [None]:
# Load the credit card balance table from the local input directory.
df_card = pd.read_csv('input/credit_card_balance.csv')
# Print the column/dtype/memory summary, then preview the first rows.
df_card.info()
df_card.head()

In [None]:
# Spot-check one previous credit (SK_ID_PREV 2562384), ordered by MONTHS_BALANCE.
df_card.loc[df_card['SK_ID_PREV'] == 2562384].sort_values('MONTHS_BALANCE').head(10)

In [None]:
# Lookup from each SK_ID_PREV to the first SK_ID_CURR recorded for it; it is
# used later to roll the per-credit aggregates up to applicant level.
id_prev_curr = df_card.groupby('SK_ID_PREV')['SK_ID_CURR'].first()
display(id_prev_curr.head(5))

# The mapping now lives in id_prev_curr, so the column is removed from df_card.
# NOTE: this cell is not re-runnable once SK_ID_CURR has been dropped.
df_card = df_card.drop(columns='SK_ID_CURR')

In [None]:
# NAME_CONTRACT_STATUS is the single categorical column: show how often each level occurs.
df_card['NAME_CONTRACT_STATUS'].value_counts()

In [None]:
# Per SK_ID_PREV, count the monthly records in each contract status
# (one column per status, 0 where a status never occurs) ...
status = (
    df_card.groupby('SK_ID_PREV')['NAME_CONTRACT_STATUS']
    .value_counts()
    .unstack(fill_value=0)
)
# ... and list the credits that have exactly one 'Refused' record.
status.loc[status['Refused'] == 1]

In [None]:
# Status and balance history of SK_ID_PREV 1211141, ordered by MONTHS_BALANCE.
(
    df_card.loc[df_card['SK_ID_PREV'] == 1211141,
                ['MONTHS_BALANCE', 'NAME_CONTRACT_STATUS', 'AMT_BALANCE']]
    .sort_values('MONTHS_BALANCE')
    .head(10)
)

In [None]:
# Same credit, all columns; transposed so every field of the first 10 months is readable.
df_card.loc[df_card['SK_ID_PREV'] == 1211141].sort_values('MONTHS_BALANCE').head(10).transpose()

In [None]:
# Explore one credit line (SK_ID_PREV 1468118) month by month.
# .copy() gives an independent frame so the helper columns added below do not
# trigger SettingWithCopyWarning or touch df_card.
tmp = df_card[df_card.SK_ID_PREV==1468118].sort_values(by='MONTHS_BALANCE').copy()

# Balance, minimum instalment, payments and drawings over time; labelled so
# the four lines can be told apart.
plt.plot(tmp.MONTHS_BALANCE, tmp.AMT_BALANCE, label='AMT_BALANCE')
plt.plot(tmp.MONTHS_BALANCE, tmp.AMT_INST_MIN_REGULARITY, label='AMT_INST_MIN_REGULARITY')
plt.plot(tmp.MONTHS_BALANCE, tmp.AMT_PAYMENT_CURRENT, label='AMT_PAYMENT_CURRENT')
plt.plot(tmp.MONTHS_BALANCE, tmp.AMT_DRAWINGS_CURRENT, label='AMT_DRAWINGS_CURRENT')
plt.xlabel('MONTHS_BALANCE')
plt.legend()
plt.show()

# Candidate reconstruction of the monthly balance change from the cash flows,
# next to the observed month-over-month change in AMT_BALANCE.
tmp['my_diff'] = tmp.AMT_DRAWINGS_CURRENT+tmp.AMT_INST_MIN_REGULARITY-tmp.AMT_PAYMENT_TOTAL_CURRENT
tmp['balance_diff'] = tmp.AMT_BALANCE.diff()

# display() is required: this is not the last expression of the cell (the
# `del` below is), so a bare expression would be evaluated and discarded.
display(tmp[['AMT_PAYMENT_TOTAL_CURRENT','AMT_INST_MIN_REGULARITY','AMT_DRAWINGS_CURRENT',
             'AMT_BALANCE','my_diff','balance_diff','NAME_CONTRACT_STATUS']].head(10))

del tmp

In [None]:
# Minimum instalment minus the total amount paid in the month
# (positive when less than the minimum instalment was paid).
df_card['PAYMENT_DIFF'] = df_card['AMT_INST_MIN_REGULARITY'].sub(df_card['AMT_PAYMENT_TOTAL_CURRENT'])

In [None]:
# Order rows by credit and month (so the later 'first'/'last' aggregations
# follow MONTHS_BALANCE order), renumber the index, then one-hot encode the
# remaining categorical column(s).
df_card = pd.get_dummies(
    df_card.sort_values(['SK_ID_PREV', 'MONTHS_BALANCE']).reset_index(drop=True)
)

In [None]:
# Shorten the raw column names so the aggregated feature names built from
# them later stay readable.
df_card = df_card.rename(columns={
    # month index, balance and limit
    'MONTHS_BALANCE': 'MONTH',
    'AMT_BALANCE': 'BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL': 'LIMIT',
    # drawing amounts
    'AMT_DRAWINGS_ATM_CURRENT': 'AMT_ATM',
    'AMT_DRAWINGS_CURRENT': 'AMT_DRAW',
    'AMT_DRAWINGS_OTHER_CURRENT': 'AMT_OTHER',
    'AMT_DRAWINGS_POS_CURRENT': 'AMT_POS',
    # instalments, payments and receivables
    'AMT_INST_MIN_REGULARITY': 'MIN_INST',
    'AMT_PAYMENT_CURRENT': 'PAID',
    'AMT_PAYMENT_TOTAL_CURRENT': 'PAID_TOT',
    'AMT_RECEIVABLE_PRINCIPAL': 'PRINC',
    'AMT_RECIVABLE': 'RECIV',
    'AMT_TOTAL_RECEIVABLE': 'RECIV_TOT',
    # drawing / instalment counts
    'CNT_DRAWINGS_ATM_CURRENT': 'CNT_ATM',
    'CNT_DRAWINGS_CURRENT': 'CNT_DRAW',
    'CNT_DRAWINGS_OTHER_CURRENT': 'CNT_OTHER',
    'CNT_DRAWINGS_POS_CURRENT': 'CNT_POS',
    'CNT_INSTALMENT_MATURE_CUM': 'CNT_INST',
    # one-hot contract status columns produced by get_dummies
    'NAME_CONTRACT_STATUS_Active': 'STAT_Active',
    'NAME_CONTRACT_STATUS_Approved': 'STAT_Appr',
    'NAME_CONTRACT_STATUS_Completed': 'STAT_Compl',
    'NAME_CONTRACT_STATUS_Demand': 'STAT_Deman',
    'NAME_CONTRACT_STATUS_Refused': 'STAT_Refu',
    'NAME_CONTRACT_STATUS_Sent proposal': 'STAT_Sent',
    'NAME_CONTRACT_STATUS_Signed': 'STAT_Sign',
})

df_card.head()

In [None]:
# Collapse the monthly rows into one row per SK_ID_PREV with five summary
# statistics for every column.
card_grouped = df_card.groupby('SK_ID_PREV').agg(['first', 'last', 'mean', 'max', 'min'])

# Flatten the (column, statistic) MultiIndex into 'CC_<column>_<statistic>'.
card_grouped.columns = [f'CC_{column}_{stat}' for column, stat in card_grouped.columns]

# The monthly table is no longer needed; release it.
del df_card

card_grouped.head()

In [None]:
# Columns whose absolute values add up to zero carry no information: list them, then drop them.
drop_cols = [name for name, abs_total in card_grouped.abs().sum().items() if abs_total == 0]
print(drop_cols)

card_grouped = card_grouped.drop(columns=drop_cols)

In [None]:
# Attach the owning applicant to every credit (aligned on the SK_ID_PREV index).
card_grouped['SK_ID_CURR'] = id_prev_curr

# Roll the per-credit features up to one row per applicant.
cc_idcurr = card_grouped.groupby('SK_ID_CURR').agg(['min', 'max', 'sum', 'mean'])
# Number of credit card lines per applicant.
# NOTE(review): this column is added while the columns are still a two-level
# MultiIndex, so after the flattening below its name appears to come out as
# 'CC_count_' (trailing underscore) - confirm before relying on the name.
cc_idcurr['CC_count'] = card_grouped['SK_ID_CURR'].value_counts()

# Flatten (feature, statistic) pairs into '<feature>_<statistic>' names.
cc_idcurr.columns = ['_'.join(levels) for levels in cc_idcurr.columns]
cc_cols = cc_idcurr.columns

del card_grouped

cc_idcurr.head()

In [None]:
# Train: read the application table and left-join the credit card features on
# the SK_ID_CURR index (applicants without credit card data get NaN).
print('loading train...')
df_train = pd.read_csv('input/application_train.csv', index_col='SK_ID_CURR')

print('merging train...')
df_train = pd.merge(df_train, cc_idcurr, how='left', left_index=True, right_index=True)

# Test data is only loaded and merged when the results are to be saved.
if save_files:
    print('saving train...')
    df_train.loc[:, cc_cols].to_csv('input/app_train_ALL_cc.csv')

    print('loading test...')
    df_test = pd.read_csv('input/application_test.csv', index_col='SK_ID_CURR')

    print('merging test...')
    df_test = pd.merge(df_test, cc_idcurr, how='left', left_index=True, right_index=True)

    print('saving test...')
    df_test.loc[:, cc_cols].to_csv('input/app_test_ALL_cc.csv')

# The applicant-level feature table has been merged into the frames above.
del cc_idcurr

display(df_train.head())

print('Done!')

## Model

In [None]:
def plot_roc_curve(y_true, y_pred, color='darkorange',plot_random=True,label='ROC curve'):
    """Plot a ROC curve on the current matplotlib figure and show it.

    Parameters
    ----------
    y_true : array-like
        True binary labels, passed to ``roc_curve`` / ``roc_auc_score``.
    y_pred : array-like
        Predicted scores for the same samples.
    color : str, default 'darkorange'
        Line colour of the ROC curve.
    plot_random : bool, default True
        If True, also draw the dashed diagonal from (0, 0) to (1, 1) as the
        reference line.
    label : str, default 'ROC curve'
        Legend label; the AUC (two decimals) is appended to it.

    Returns
    -------
    None
        The figure is rendered via ``plt.show()``.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    score = roc_auc_score(y_true, y_pred)

    # Honour the caller's colour instead of a hard-coded one.
    plt.plot(fpr, tpr, color=color,
             lw=2, label=label+' (area = %0.2f)' % score)

    if plot_random:
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])  # small headroom so the curve's top edge stays visible
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Balance the classes: TARGET == 1 rows are weighted by the ratio of
# TARGET == 0 to TARGET == 1 counts, TARGET == 0 rows keep weight 1.
target_counts = df_train['TARGET'].value_counts()
targ1_weight = target_counts[0] / target_counts[1]
weights = df_train['TARGET'].replace({0: 1, 1: targ1_weight}).values

# Convert object columns to pandas 'category' dtype before building the LightGBM dataset.
dtypes = df_train.dtypes
cols_categ = dtypes[dtypes == 'object'].index.tolist()
for col in cols_categ:
    df_train[col] = df_train[col].astype('category')

# Split label and features and wrap them in a weighted LightGBM dataset.
y_true = df_train['TARGET']
X = df_train.drop(columns='TARGET')
train_data = lgb.Dataset(data=X, label=y_true, weight=weights)

# Fit a binary classifier with otherwise default settings.
param = {
    'objective': 'binary',
    'metric': 'auc',
}
bst = lgb.train(params=param, train_set=train_data)

# ROC curve on the training data itself (in-sample, not a validation score).
plt.figure()
plot_roc_curve(y_true, bst.predict(X))

In [None]:
# Per-feature importance from the trained booster, by number of splits and by total gain.
feat_imp = pd.DataFrame({'split':bst.feature_importance(importance_type='split'),
                         'gain':bst.feature_importance(importance_type='gain')},
                        index=bst.feature_name())

# Each feature's share of the total gain, largest first.
total_gain = feat_imp['gain'].sum()
frac_gain = feat_imp['gain']/total_gain
frac_gain = frac_gain.sort_values(ascending=False)

display(frac_gain.head(30))

# A single figure with two panels. The extra 10x30 figure that used to be
# opened at the top of this cell was never drawn on and has been removed.
plt.figure(figsize=(15,5))

# Left: gain share of the n-th most important feature.
plt.subplot(1,2,1)
plt.plot(np.arange(len(frac_gain))+1, frac_gain)
plt.xlabel('n_features')
plt.ylabel('frac gain')
plt.title('Gain fraction by feature rank')

# Right: cumulative gain share of the top-n features.
plt.subplot(1,2,2)
plt.plot(np.arange(len(frac_gain))+1,frac_gain.cumsum())
plt.xlabel('n_features')
plt.ylabel('cumulative frac gain')
plt.title('Cumulative gain fraction')

plt.show()

In [None]:
# Keep the n_cols most important credit card features. frac_gain is already
# sorted by descending gain share, so the first n_cols matches are the top ones.
n_cols = 60
# Credit card features are identified by their 'CC_' prefix; startswith avoids
# also matching features that merely contain 'CC_' somewhere in their name
# (str.contains is an unanchored regex search).
select_cols = frac_gain[frac_gain.index.str.startswith('CC_')].index[:n_cols]

# df_test only exists when save_files is True (it is loaded under the same flag).
if save_files:
    df_train[select_cols].to_csv('input/app_train_SELECT_cc.csv')
    df_test[select_cols].to_csv('input/app_test_SELECT_cc.csv')