In [None]:
# import common packages
import pandas as pd
import numpy as np
import os as os
import warnings
warnings.filterwarnings("ignore")

# import specific packages
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

import lightgbm as gbm
import matplotlib.pyplot as plt


## Load data

First we load all the files saved in the `data` folder.

In [2]:
data_path = './data'

for filename in os.listdir(data_path):
    print(filename)

application_test.csv
HomeCredit_columns_description.csv
POS_CASH_balance.csv
credit_card_balance.csv
installments_payments.csv
application_train.csv
bureau.csv
previous_application.csv
bureau_balance.csv
sample_submission.csv


In [None]:
test = pd.read_csv(os.path.join(data_path, 'application_test.csv'))
col_desc = pd.read_csv(os.path.join(data_path, 'HomeCredit_columns_description.csv'))
pos_cash_balance = pd.read_csv(os.path.join(data_path, 'POS_CASH_balance.csv'))
credit_card_balance = pd.read_csv(os.path.join(data_path, 'credit_card_balance.csv'))
installments_payments = pd.read_csv(os.path.join(data_path, 'installments_payments.csv'))
train = pd.read_csv(os.path.join(data_path, 'application_train.csv'))
bureau = pd.read_csv(os.path.join(data_path, 'bureau.csv'))
prev_application = pd.read_csv(os.path.join(data_path, 'previous_application.csv'))
bureau_balance = pd.read_csv(os.path.join(data_path, 'bureau_balance.csv'))
sample_submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))


## Prepare Data

This notebook will do the following:

- Label encoding categorical features with `sklearn.preprocessing.LabelEncoder`
- Handel missing features:
	- There are several strategies to handle the missing features: here we will set the values to some values.
	- We use the `df[num_var].fillna(-999)` here. But there are more options if you use the `sklearn.preprocessing.Imputer`


```from sklearn.preprocessing import Imputer

imputer = Imputer(strategy='median')
num_vars = df.select_dtypes['float64', 'float32', 'int64', 'int32'].columns.tolist()
df[num_vars] = imputer.fit_transform(df[num_vars])
```


In [None]:
lb = LabelEncoder()

def label_encoding_cat(df):
    df = df.copy()
    cat_var = df.select_dtypes('object').columns.tolist()
    for col in cat_var:
        df[col] = lb.fit_transform(df[col].astype('str'))
    return df

def fill_na(df):
    df = df.copy()
    num_var = df.select_dtypes(['float64', 'float32', 'int64', 'int32']).columns.tolist()
    df[num_var] = df[num_var].fillna(-999)
    return df


### Aggregated features

Let's aggregate the numerical features for each *SK_ID_CURR*.

```
df.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max']).reset_index()
```

If only certain columns needs to be aggregated:

```
df.groupby(['col1', ..., 'coln'], as_index=False).mean()
```


In [5]:
label_1 = ['%s_%s'%(s, l) for s in bureau.select_dtypes(['float64', 'float32', 'int64', 'int32']).columns.tolist() if s!='SK_ID_CURR' for l in ['mean', 'count', 'median', 'max']]
agg_bureau = bureau.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max']).reset_index()
agg_bureau.columns = ['SK_ID_CURR'] + label_1

In [6]:
label_2 = ['%s_%s'%(s, l) for s in prev_application.select_dtypes(['float64', 'float32', 'int64', 'int32']).columns.tolist() if s not in ['SK_ID_CURR'] for l in ['mean', 'count', 'median', 'max']]
agg_prev_application = prev_application.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max']).reset_index()
agg_prev_application.columns = ['SK_ID_CURR'] + label_2

In [7]:
# join bureau and bureau_balance to get SK_ID_CURR to bureau_balance
bureau_balance = pd.read_csv(os.path.join(data_path, 'bureau_balance.csv'))
bureau_balance = pd.merge(bureau_balance, bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], how='left', on='SK_ID_BUREAU')

label_3 = ['%s_%s'%(s, l) for s in bureau_balance.select_dtypes(['float64', 'float32', 'int64', 'int32']).columns.tolist() if s not in ['SK_ID_BUREAU'] for l in ['mean', 'count', 'median', 'max']]
agg_bureau_balance = bureau_balance.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max']).reset_index()
agg_bureau_balance.columns = ['SK_ID_CURR'] + label_3


In [8]:
label_4 = ['%s_%s'%(s, l) for s in credit_card_balance.select_dtypes(['float64', 'float32', 'int64', 'int32']).columns.tolist() if s not in ['SK_ID_CURR'] for l in ['mean', 'count', 'median', 'max']]
agg_credit_card_balance = credit_card_balance.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max']).reset_index()
agg_credit_card_balance.columns = ['SK_ID_CURR'] + label_4

In [9]:
label_5 = ['%s_%s'%(s, l) for s in installments_payments.select_dtypes(['float64', 'float32', 'int64', 'int32']).columns.tolist() if s not in ['SK_ID_CURR'] for l in ['mean', 'count', 'median', 'max']]
agg_installments_payments = installments_payments.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max']).reset_index()
agg_installments_payments.columns = ['SK_ID_CURR'] + label_5

In [10]:
label_6 = ['%s_%s'%(s, l) for s in pos_cash_balance.select_dtypes(['float64', 'float32', 'int64', 'int32']).columns.tolist() if s not in ['SK_ID_CURR'] for l in ['mean', 'count', 'median', 'max']]
agg_pos_cash_balance = pos_cash_balance.groupby('SK_ID_CURR').agg(['mean', 'count', 'median', 'max']).reset_index()
agg_pos_cash_balance.columns = ['SK_ID_CURR'] + label_6

In [None]:
del(label_1,label_2,label_3,label_4,label_5,label_6)


First we combine the training set and testing set together, so that we can apply `label_encoding_cat` und `fill_na` afterwards.
We merge the aggregated table to the train & test table on `SK_ID_CURR`.


In [12]:
tr_te = train.drop('TARGET', axis=1).append(test)\
    .pipe(label_encoding_cat)\
    .pipe(fill_na)\
    .merge(agg_bureau, how='left', on='SK_ID_CURR')\
    .merge(agg_credit_card_balance, how='left', on='SK_ID_CURR')\
    .merge(agg_pos_cash_balance, how='left', on='SK_ID_CURR')\
    .merge(agg_prev_application, how='left', on='SK_ID_CURR')

In [13]:
tri = train.shape[0]
y = train.TARGET.copy()

In [None]:
# Create the training and testing df.
tr_te.drop('SK_ID_CURR', axis=1, inplace=True)
tr = fill_na(tr_te).iloc[:tri, :].copy()
te = fill_na(tr_te).iloc[tri:, :].copy()


## Train model: Light Gradient Boosting Machine

I added up [LightGBM](https://lightgbm.readthedocs.io/en/latest/Python-Intro.html) as that I noticed on Kaggle this would achieve higher AUC_ROC score (about 0.78) compared to RandomForestClassifier (auc_roc at about 0.71). 

Before going on to training, let's explain the steps of training with LightGBM.

**Load data**

```
train_data = gmb.Dataset(df)
# OR 
train_data = gmb.Dataset(data, label=label)
```

**Booster parameters**

```
param = {'num_leaves':31, 'num_trees':100, 'objective':'binary'}

```

**Metric parameters**

Here we can set it to `auc`.
```
param['metric'] = 'auc'
```

**



In [15]:
Dparam = {'objective' : 'binary',
          'boosting_type': 'gbdt',
          'metric' : 'auc',
          'nthread' : 4,
          'shrinkage_rate':0.025,
          'max_depth':8,
          'min_data_in_leaf':100,
          'min_child_weight': 2,
          'bagging_fraction':0.75,
          'feature_fraction':0.75,
          'min_split_gain':.01,
          'lambda_l1':1,
          'lambda_l2':1,
          'num_leaves':36} 

In [16]:
folds = KFold(n_splits=5, shuffle=True, random_state=123456)

In [17]:
off_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

In [20]:
feature_importance_df = pd.DataFrame()
feats = [f for f in train.columns if f not in ['SK_ID_CURR']]

[1000]	train's auc: 0.872276	val's auc: 0.782073


[2000]	train's auc: 0.925911	val's auc: 0.781464


[3000]	train's auc: 0.957165	val's auc: 0.780306


Fold  1 AUC: 0.780306


[1000]	train's auc: 0.874167	val's auc: 0.783202


[2000]	train's auc: 0.926376	val's auc: 0.782785


[3000]	train's auc: 0.956695	val's auc: 0.780828


Fold  2 AUC: 0.780828


[1000]	train's auc: 0.872797	val's auc: 0.782933


[2000]	train's auc: 0.924618	val's auc: 0.782177


[3000]	train's auc: 0.955913	val's auc: 0.780925


Fold  3 AUC: 0.780925


[1000]	train's auc: 0.87251	val's auc: 0.780649


[2000]	train's auc: 0.925375	val's auc: 0.78074


[3000]	train's auc: 0.956294	val's auc: 0.779591


Fold  4 AUC: 0.779591


[1000]	train's auc: 0.873406	val's auc: 0.779406


[2000]	train's auc: 0.925119	val's auc: 0.778319


[3000]	train's auc: 0.956101	val's auc: 0.777131


Fold  5 AUC: 0.777131


In [22]:
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(tr)):
    dtrain = gbm.Dataset(tr.iloc[trn_idx], y.iloc[trn_idx])
    dval = gbm.Dataset(tr.iloc[val_idx], y.iloc[val_idx])
    m_gbm = gbm.train(params=Dparam, train_set=dtrain, num_boost_round=3000, verbose_eval=1000, valid_sets=[dtrain, dval],valid_names=['train','val'])
    off_preds[val_idx] = m_gbm.predict(tr.iloc[val_idx])
    sub_preds += m_gbm.predict(te) / folds.n_splits
    fold_importance_df = pd.DataFrame()
    fold_importance_df['feature'] = feats
    fold_importance_df['fold'] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('Fold %2d AUC: %.6f' % (n_fold+1, roc_auc_score(y.iloc[val_idx], off_preds[val_idx])))

[1000]	train's auc: 0.872276	val's auc: 0.782073


[2000]	train's auc: 0.925911	val's auc: 0.781464


[3000]	train's auc: 0.957165	val's auc: 0.780306


Fold  1 AUC: 0.780306


[1000]	train's auc: 0.874167	val's auc: 0.783202


[2000]	train's auc: 0.926376	val's auc: 0.782785


[3000]	train's auc: 0.956695	val's auc: 0.780828


Fold  2 AUC: 0.780828


[1000]	train's auc: 0.872797	val's auc: 0.782933


[2000]	train's auc: 0.924618	val's auc: 0.782177


[3000]	train's auc: 0.955913	val's auc: 0.780925


Fold  3 AUC: 0.780925


[1000]	train's auc: 0.87251	val's auc: 0.780649


[2000]	train's auc: 0.925375	val's auc: 0.78074


[3000]	train's auc: 0.956294	val's auc: 0.779591


Fold  4 AUC: 0.779591


[1000]	train's auc: 0.873406	val's auc: 0.779406


[2000]	train's auc: 0.925119	val's auc: 0.778319


[3000]	train's auc: 0.956101	val's auc: 0.777131


Fold  5 AUC: 0.777131


The auc on training set is higher than it on validation set, so we can see that the model is overfitting the training data. There is still room to improve its performance on validation set.

##Train model: Random Forest Classifier

In [27]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(tr)):
    clone_clf = clone(rf_clf)
    clone_clf.fit(tr.iloc[trn_idx], y.iloc[trn_idx])
    train_proba = clone_clf.predict_proba(tr.iloc[trn_idx])
    val_proba = clone_clf.predict_proba(tr.iloc[val_idx])
    trn_roc_auc = roc_auc_score(y.iloc[trn_idx], train_proba[:,1])
    val_roc_auc = roc_auc_score(y.iloc[val_idx], val_proba[:,1])
    print('%d Fold: train AUC %.6f, val AUC %.6f' % (n_fold+1, trn_roc_auc, val_roc_auc))

1 Fold: train AUC 1.000000, val AUC 0.711845


2 Fold: train AUC 1.000000, val AUC 0.713323


3 Fold: train AUC 1.000000, val AUC 0.713551


4 Fold: train AUC 1.000000, val AUC 0.712408


5 Fold: train AUC 1.000000, val AUC 0.715944


The auc on training set is higher than it on validation set, so we can see that the model is overfitting the training data. There is still room to improve its performance on validation set.