Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. In order to make sure this underserved population has a positive loan experience, Home Credit makes use of a variety of alternative data--including telco and transactional information--to predict their clients' repayment abilities.

While Home Credit is currently using various statistical and machine learning methods to make these predictions, they're challenging Kagglers to help them unlock the full potential of their data. Doing so will ensure that clients capable of repayment are not rejected and that loans are given with a principal, maturity, and repayment calendar that will empower their clients to be successful.

In [None]:
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split



from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
import gc


In [None]:
# Locate the data directory under /kaggle/input and list every file found.
# The result, `dirName`, always ends with "/" so file names can be appended.
dirName = None
last_dir = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    last_dir = dirname
    for filename in filenames:
        print(os.path.join(dirname, filename))
    # Prefer the directory that really holds the tables read below, instead
    # of whichever directory os.walk happens to visit last.
    if dirName is None and 'application_train.csv' in filenames:
        dirName = dirname
if dirName is None:
    # Previous behaviour: fall back to the last directory visited.
    dirName = last_dir
if dirName is None:
    # os.walk yielded nothing; fail with a clear message rather than the
    # opaque TypeError that `None + "/"` would raise.
    raise FileNotFoundError("No input directory found under /kaggle/input")
dirName = dirName + "/"

In [None]:
# Load the raw bureau balance table and look at its (rows, columns) shape.
bureau_balance_path = dirName + 'bureau_balance.csv'
buro_bal = pd.read_csv(bureau_balance_path)
buro_bal.shape

In [None]:
# Build `buro_full`: one row of averaged bureau features per SK_ID_CURR.
#
# Step 1 - bureau_balance: one-hot encode, then average per SK_ID_BUREAU.
buro_bal = pd.read_csv(dirName + 'bureau_balance.csv')
buro_bal = pd.get_dummies(buro_bal, drop_first=True)

# Number of balance rows per bureau credit; it is constant within a group,
# so it survives the mean() below unchanged.
buro_counts = buro_bal[['SK_ID_BUREAU', 'MONTHS_BALANCE']].groupby('SK_ID_BUREAU').count()
buro_bal['buro_count'] = buro_bal['SK_ID_BUREAU'].map(buro_counts['MONTHS_BALANCE'])

buro_bal = buro_bal.groupby('SK_ID_BUREAU').mean()

# Step 2 - bureau: encode it like every other table in this notebook.
# Without this, any text column reaching groupby().mean() is silently dropped
# by older pandas and raises TypeError on pandas >= 2.0.
buro = pd.read_csv(dirName + 'bureau.csv')
buro = pd.get_dummies(buro, drop_first=True)

# Attach the per-credit balance averages to each bureau record.
buro_full = buro.merge(right=buro_bal.reset_index(), how='left', on='SK_ID_BUREAU', suffixes=('', '_bur_bal'))

# Replace the bureau id (meaningless once averaged) with the number of
# bureau records the applicant has.
nb_bureau_per_curr = buro_full[['SK_ID_CURR', 'SK_ID_BUREAU']].groupby('SK_ID_CURR').count()
buro_full['SK_ID_BUREAU'] = buro_full['SK_ID_CURR'].map(nb_bureau_per_curr['SK_ID_BUREAU'])

# Step 3 - collapse to one row per applicant.
buro_full = buro_full.groupby('SK_ID_CURR').mean()


In [None]:
# Load the raw previous-applications table.
prev_app_path = dirName + 'previous_application.csv'
prev_app = pd.read_csv(prev_app_path)


In [None]:
# Collapse previous applications to one averaged row per SK_ID_CURR.
prev_app = pd.get_dummies(data=prev_app, drop_first=True)

# SK_ID_PREV is overwritten with the number of previous applications per
# applicant; the value is constant within a group, so mean() keeps it.
prev_app_count = prev_app.groupby('SK_ID_CURR')[['SK_ID_PREV']].count()
prev_app['SK_ID_PREV'] = prev_app['SK_ID_CURR'].map(prev_app_count['SK_ID_PREV'])

prev_app = prev_app.groupby('SK_ID_CURR').mean()

In [None]:
# Load the raw POS/cash balance table.
pos_path = dirName + 'POS_CASH_balance.csv'
pos = pd.read_csv(pos_path)

In [None]:
# Collapse POS/cash balance rows to one averaged row per SK_ID_CURR.
pos = pd.get_dummies(data=pos, drop_first=True)

# Row count per applicant, stored in place of SK_ID_PREV so that it comes
# through the mean() below as a count feature.
pos_grp_count = pos.groupby("SK_ID_CURR")[["SK_ID_PREV"]].count()
pos['SK_ID_PREV'] = pos['SK_ID_CURR'].map(pos_grp_count['SK_ID_PREV'])

pos = pos.groupby("SK_ID_CURR").mean()

In [None]:
# Load the raw credit-card balance table.
cc_bal_path = dirName + 'credit_card_balance.csv'
cc_bal = pd.read_csv(cc_bal_path)

In [None]:
# Collapse credit-card balance rows to one averaged row per SK_ID_CURR.
cc_bal = pd.get_dummies(data=cc_bal, drop_first=True)

# Row count per applicant, written over SK_ID_PREV; constant within each
# group, so the mean() below leaves it as the count.
cc_grp_count = cc_bal.groupby("SK_ID_CURR")[["SK_ID_PREV"]].count()
cc_bal['SK_ID_PREV'] = cc_bal['SK_ID_CURR'].map(cc_grp_count['SK_ID_PREV'])

cc_bal = cc_bal.groupby("SK_ID_CURR").mean()

In [None]:
# Load the raw installment-payments table.
inst_path = dirName + 'installments_payments.csv'
inst = pd.read_csv(inst_path)

In [None]:
# Collapse installment payments to one averaged row per SK_ID_CURR.
inst = pd.get_dummies(data=inst, drop_first=True)

# Row count per applicant replaces SK_ID_PREV and passes through mean()
# unchanged because it is constant within each group.
inst_grp_count = inst.groupby("SK_ID_CURR")[["SK_ID_PREV"]].count()
inst['SK_ID_PREV'] = inst['SK_ID_CURR'].map(inst_grp_count['SK_ID_PREV'])

inst = inst.groupby("SK_ID_CURR").mean()

In [None]:
# Load the application tables and one-hot encode them TOGETHER.
# Encoding train and test separately lets the two frames end up with
# different dummy columns (a category present in only one file), and
# drop_first may even drop a different baseline category in each.
train = pd.read_csv(dirName+"application_train.csv")
test = pd.read_csv(dirName+"application_test.csv")

# Remember the label dtype: stacking with rows that lack TARGET turns the
# column into float, and it is restored on the train part below.
target_dtype = train['TARGET'].dtype

# `keys` tags each row with its origin so the frames can be split back with
# their original row indices intact.
applications = pd.get_dummies(
    pd.concat([train, test], keys=['train', 'test'], sort=False),
    drop_first=True,
)
train = applications.loc['train'].astype({'TARGET': target_dtype})
# Drop the all-missing TARGET that stacking introduced on the test rows, so
# `test` has the same shape of content as when it was encoded on its own.
test = applications.loc['test'].drop(columns=['TARGET'])
del applications

In [None]:
# Stack train rows on top of test rows so the feature merges are applied
# to both in a single pass.
data = pd.concat(objs=[train, test], axis='index')

In [None]:
# Left-join every per-applicant feature table onto the application rows.
#
# Each table's columns get a distinct prefix first. Several tables share
# column names (for example, SK_ID_PREV is kept in prev_app, pos, cc_bal and
# inst). With pandas' default "_x"/"_y" suffixes the same suffixed name is
# produced by more than one merge, which yields duplicate columns on older
# pandas and a MergeError on pandas >= 2.0.
aux_tables = (
    ('buro_', buro_full),
    ('prev_', prev_app),
    ('pos_', pos),
    ('cc_', cc_bal),
    ('inst_', inst),
)
for prefix, table in aux_tables:
    # add_prefix renames columns only; SK_ID_CURR is the index here, so it
    # comes back unprefixed from reset_index() and still works as the key.
    data = data.merge(right=table.add_prefix(prefix).reset_index(), how='left', on='SK_ID_CURR')

In [None]:
# Replace every non-alphanumeric character in the column names with "_"
# (presumably so LightGBM accepts them as feature names - TODO confirm).
data.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in data.columns]

# Split the stacked frame back by position: the first len(train) rows are
# the training rows, the rest are the test rows.
# Explicit copies are taken because both frames are modified in place later
# (columns dropped with inplace=True); doing that on a slice of `data`
# triggers pandas' SettingWithCopyWarning.
n_train = train.shape[0]
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

In [None]:
# Separate the label from the features: keep TARGET as Y, then remove it
# from the training frame in place.
Y = train['TARGET']
train.drop(columns=['TARGET'], inplace=True)

In [None]:
# Hold out 20% of the training rows for validation (fixed seed), then wrap
# both parts as LightGBM datasets.
x_train, x_val, y_train, y_val = train_test_split(
    train,
    Y,
    test_size=0.2,
    random_state=18,
)
lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_eval = lgb.Dataset(x_val, label=y_val)

In [None]:
# Train a gradient-boosted binary classifier, scored by AUC on the
# validation set.
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}

# The `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
# were removed in LightGBM 4.0; callbacks are the supported equivalent.
# `log_evaluation` replaced `print_evaluation` in 3.3, so fall back to the
# old name on earlier releases.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_eval],
    callbacks=[
        # Stop when validation AUC has not improved for 150 rounds.
        lgb.early_stopping(stopping_rounds=150),
        # Print the validation metric every 200 rounds.
        log_evaluation(period=200),
    ],
)

In [None]:
# Plot the 100 most important features of the trained model.
# The trailing ";" suppresses the returned object's repr in the cell output.
lgb.plot_importance(
    model,
    max_num_features=100,
    figsize=(12, 25),
);

In [None]:
# Score the test rows and write the submission file.
#
# `test` was sliced from the stacked frame and may still carry the (empty)
# TARGET column, which the model was not trained on. Predict on a view
# without it so the feature columns line up with the training data.
# errors='ignore' keeps this cell re-runnable after TARGET has been dropped.
test_features = test.drop(columns=['TARGET'], errors='ignore')
preds = model.predict(test_features)

sub_lgb = pd.DataFrame()
sub_lgb['SK_ID_CURR'] = test['SK_ID_CURR']
sub_lgb['TARGET'] = preds
sub_lgb.to_csv("submission.csv", index=False)
sub_lgb.head()

In [None]:
# Remove the TARGET column from the test frame in place.
test.drop(columns="TARGET", inplace=True)

In [None]:
# Bare expression: shows the test frame as the cell output when run in a notebook.
test