# Entity Relationship Diagram (ERD)

![](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

# Libraries

In [6]:
import numpy as np
import pandas as pd
import gc
import time
# https://stackoverflow.com/questions/3693771/trying-to-understand-python-with-statement-and-context-managers
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Helper Functions

## Timer

In [7]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield # runs the with block here
    print('{} - done in {:.0f}s'.format(title, time.time() - t0))

## One-Hot Encoder

In [46]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

## Preprocessing of application train test

In [54]:
def application_train_test(num_rows = 10000, nan_as_category=False):
    # read and union data
    DIR = '~/kaggle-home-credit'
    df = pd.read_csv(DIR + '/input/application_train.csv', nrows=num_rows)
    test_df = pd.read_csv(DIR + '/input/application_test.csv', nrows=num_rows)
    print('Train shape: {}'.format(df.shape))
    print('Test shape: {}'.format(test_df.shape))
    
    # Union of train and test sets
    df = df.append(test_df).reset_index()
    
    # remove rows without gender
    df = df[df['CODE_GENDER'] != 'XNA']
    
    # identify columns of client provided documents
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    #print(docs)
    
    # identify columns of real estate and personal documents
    live = [_f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]
    
    # replace 365243 as NaN
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    
    # this returns a series of median rather than a dataframe for mapping
    inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
    
    # Feature Engineering
    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    # can consider sum instead of kurtosis
    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
    # added 1 to avoid division by zero
    df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    # maps median of median income by organization
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    
    # Binary encoding of categorical binary features
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        # returns labels and uniques
        df[bin_feature], uniques = pd.factorize(df[bin_feature])

    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    dropcolum=['FLAG_DOCUMENT_2','FLAG_DOCUMENT_4',
    'FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7',
    'FLAG_DOCUMENT_8','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10', 
    'FLAG_DOCUMENT_11','FLAG_DOCUMENT_12','FLAG_DOCUMENT_13',
    'FLAG_DOCUMENT_14','FLAG_DOCUMENT_15','FLAG_DOCUMENT_16',
    'FLAG_DOCUMENT_17','FLAG_DOCUMENT_18','FLAG_DOCUMENT_19',
    'FLAG_DOCUMENT_20','FLAG_DOCUMENT_21']
    df= df.drop(dropcolum,axis=1)
    del test_df
    gc.collect()
    
    print('Combined shape: {}'.format(df.shape))
    
    return df


In [55]:
application_train_test()

Train shape: (10000, 122)
Test shape: (10000, 121)
Combined shape: (20000, 238)


Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0,24700.5,406597.5,351000.0,202500.000,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
1,1,35698.5,1293502.5,1129500.0,270000.000,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,2,6750.0,135000.0,135000.0,67500.000,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,3,29686.5,312682.5,297000.0,135000.000,,,,,,...,0,0,0,0,0,0,0,0,0,1
4,4,21865.5,513000.0,513000.0,121500.000,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
5,5,27517.5,490495.5,454500.0,99000.000,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
6,6,41301.0,1560726.0,1395000.0,171000.000,0.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,1,0,0,0
7,7,42075.0,1530000.0,1530000.0,360000.000,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
8,8,33826.5,1019610.0,913500.0,112500.000,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
9,9,20250.0,405000.0,405000.0,135000.000,,,,,,...,0,0,0,0,0,0,0,1,0,0
