# Home Credit Default Risk (HCDR)

# A Functionized Approach

In [2]:
import gc
from sklearn.externals import joblib
import math
import numpy as np
import os
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, roc_auc_score
from statistics import mean
from sklearn.preprocessing import PolynomialFeatures
import timeit
import uuid

# Resolve the project's working directory from the first line of
# "workingdir.config" in the current directory, then derive the data and
# run-log directories from it.
config_file = os.path.join(os.getcwd(), "workingdir.config")
with open(config_file, 'r') as f:
    # readline() keeps the trailing newline; without strip() it would end up
    # embedded in WORKING_DIR and in every path built from it.
    WORKING_DIR = f.readline().strip()
DATA_DIR = os.path.join(WORKING_DIR, 'Data')
LOG_DIR = os.path.join(WORKING_DIR, 'runs')

def load_data(name):
    '''
    Read `<DATA_DIR>/<name>.csv` into a DataFrame and return it.
    '''
    csv_name = f'{name}.csv'
    return pd.read_csv(os.path.join(DATA_DIR, csv_name))

def get_datasets(phase):
    '''
    Load every table used for feature engineering.

    `phase` selects the application file ("application_<phase>.csv"); the
    supplementary tables are the same for every phase.
    Returns a dict mapping table name -> DataFrame, with the application
    table stored under the key "application".
    '''
    application = load_data(f"application_{phase}")
    print(f"loaded {len(application)} records")

    datasets = {"application": application}
    supplementary = ("bureau", "bureau_balance", "credit_card_balance",
                     "installments_payments", "previous_application",
                     "POS_CASH_balance")
    datasets.update((table, load_data(table)) for table in supplementary)
    return datasets

def _count_recent_prev_applications(prev_app, status_mask, column_name, window_days=365):
    '''
    Count, per SK_ID_CURR, the previous applications decided within the last
    `window_days` days that also satisfy `status_mask`.
    Returns a two-column frame: SK_ID_CURR and `column_name`.
    '''
    decided_recently = prev_app.DAYS_DECISION >= -window_days
    counts = (prev_app.loc[decided_recently & status_mask]
              .groupby('SK_ID_CURR').SK_ID_PREV.count())
    return counts.rename(column_name).reset_index()


def features_from_previous_application(X, datasets):
    '''
    Engineered features (previous applications decided in the past 12 months):
    - no_prev_appl: number of applications that were not canceled
    - no_approved_prev_appl: number of approved applications

    Customers without matching rows get 0 for both features.
    Returns X with the two columns appended.
    '''
    prev_app = datasets['previous_application']
    status = prev_app.NAME_CONTRACT_STATUS
    feature_masks = (
        ('no_prev_appl', status != 'Canceled'),
        ('no_approved_prev_appl', status == 'Approved'),
    )
    for column_name, status_mask in feature_masks:
        counts = _count_recent_prev_applications(prev_app, status_mask, column_name)
        # Left merge keeps X's row order, so the counts need no sorting.
        X = pd.merge(X, counts, on='SK_ID_CURR', how='left')
        X[column_name] = X[column_name].fillna(0)

    return X

def features_from_bureau(X, datasets):
    '''
    Engineered features, computed over bureau rows that are 'Active' and
    denominated in 'currency 1':
    - total_creditLimit: sum of AMT_CREDIT_SUM per customer
    - no_of_loans: number of such bureau records per customer
    - ave_creditLimit: total_creditLimit / no_of_loans

    Customers without matching bureau rows get 0 for all three.
    '''
    bureau = datasets['bureau']
    is_active = bureau.CREDIT_ACTIVE == 'Active'
    in_currency_1 = bureau.CREDIT_CURRENCY == 'currency 1'
    per_customer = bureau[is_active & in_currency_1].groupby('SK_ID_CURR')

    total_limit = per_customer.AMT_CREDIT_SUM.sum().to_frame().reset_index()
    loan_count = per_customer.SK_ID_BUREAU.count().to_frame().reset_index()

    bureau_info = pd.merge(total_limit, loan_count, how='outer', on='SK_ID_CURR')
    bureau_info = bureau_info.rename(columns={'AMT_CREDIT_SUM': 'total_creditLimit',
                                              'SK_ID_BUREAU': 'no_of_loans'})
    bureau_info['ave_creditLimit'] = (
        bureau_info['total_creditLimit'] / bureau_info['no_of_loans'])

    X = pd.merge(X, bureau_info, on='SK_ID_CURR', how='left')
    engineered = ['total_creditLimit', 'no_of_loans', 'ave_creditLimit']
    X[engineered] = X[engineered].fillna(0)

    return X

def features_from_credit_card_balance(X, datasets):
    '''
    Engineered features, computed over credit-card balance rows from the last
    24 months whose contract is 'Active' and whose credit limit is positive:
    - total_creditLimit_CC: sum of AMT_CREDIT_LIMIT_ACTUAL per customer
    - utilization_CC: summed balance / summed credit limit
    - payment_ratio_CC: summed payments / summed balance, capped at 1
      (an undefined 0/0 ratio is treated as 1)

    Customers without matching rows keep NaN in all three columns.
    '''
    ccb = datasets['credit_card_balance']
    eligible = ((ccb.MONTHS_BALANCE >= -24) &
                (ccb.NAME_CONTRACT_STATUS == 'Active') &
                (ccb.AMT_CREDIT_LIMIT_ACTUAL > 0))
    # Select the columns with a list: tuple-style selection on a groupby
    # (groupby(...)['a', 'b']) raises on recent pandas releases.
    amount_cols = ['AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_PAYMENT_TOTAL_CURRENT']
    creditCard_info = ccb[eligible].groupby('SK_ID_CURR')[amount_cols].sum()
    creditCard_info.reset_index(inplace=True)

    creditCard_info['utilization_CC'] = (
        creditCard_info['AMT_BALANCE'] / creditCard_info['AMT_CREDIT_LIMIT_ACTUAL'])
    payment_ratio = (
        creditCard_info['AMT_PAYMENT_TOTAL_CURRENT'] / creditCard_info['AMT_BALANCE'])
    # NaN (0/0) becomes 1; anything above 1, including +inf, is capped at 1.
    creditCard_info['payment_ratio_CC'] = payment_ratio.fillna(1).clip(upper=1)

    creditCard_info.drop(columns=['AMT_BALANCE', 'AMT_PAYMENT_TOTAL_CURRENT'], inplace=True)
    creditCard_info.rename(columns={'AMT_CREDIT_LIMIT_ACTUAL': 'total_creditLimit_CC'}, inplace=True)

    X = pd.merge(X, creditCard_info, on='SK_ID_CURR', how='left')
    return X # best fillna strategy will be left to grid search

def features_from_installments_payments(X, datasets):
    '''
    Engineered features:
    - past_due_times: number of installments due within the last 730 days
      that were paid after their due date (DAYS_ENTRY_PAYMENT later than
      DAYS_INSTALMENT)

    Customers without installments in that window keep NaN.
    The installments table in `datasets` is left unmodified.
    '''
    ip = datasets['installments_payments']
    recent = ip[ip['DAYS_INSTALMENT'] >= -730]
    # Compute the late flag on a standalone Series rather than writing a new
    # column into the shared installments frame.
    paid_late = ((recent['DAYS_ENTRY_PAYMENT'] - recent['DAYS_INSTALMENT']) > 0).astype(int)
    past_due_info = (paid_late.groupby(recent['SK_ID_CURR']).sum()
                     .rename('past_due_times')
                     .reset_index())

    X = pd.merge(X, past_due_info, on='SK_ID_CURR', how='left')
    return X

def features_from_application(X):
    '''
    Engineered features (added to X in place; X is also returned):
    - credit_income_ratio: AMT_CREDIT / AMT_INCOME_TOTAL
    - annuity_income_ratio: AMT_ANNUITY / AMT_INCOME_TOTAL
    '''
    income = X['AMT_INCOME_TOTAL']
    X['credit_income_ratio'] = X['AMT_CREDIT'] / income    # credit to income ratio
    X['annuity_income_ratio'] = X['AMT_ANNUITY'] / income  # annuity to income ratio
    return X

def build_ratio_features(X):
    '''
    Add a fixed set of ratio features between application columns.

    Each new column is numerator / denominator; infinite results from a
    division by zero (either sign) are replaced with 0. NaN results are left
    as they are. The columns are added to X in place and X is returned.
    '''
    # the following features come from Naimesh and Nishad @thank you
    # (new column, numerator column, denominator column)
    ratio_specs = (
        ('income_credit_percen', 'AMT_INCOME_TOTAL', 'AMT_CREDIT'),
        ('fam_member_income', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
        ('ann_incom_percen', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
        ('new_employ_to_birth_ratio', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
        ('new_credit_to_annuity', 'AMT_CREDIT', 'AMT_ANNUITY'),
        ('new_credit_to_goods_ratio', 'AMT_CREDIT', 'AMT_GOODS_PRICE'),
        ('new_car_to_birth_ratio', 'OWN_CAR_AGE', 'DAYS_BIRTH'),
        ('new_car_to_emp_ratio', 'OWN_CAR_AGE', 'DAYS_EMPLOYED'),
    )
    # A negative numerator over zero gives -inf, so both signs are replaced.
    infinities = [np.inf, -np.inf]

    for new_col, numerator, denominator in ratio_specs:
        X[new_col] = (X[numerator] / X[denominator]).replace(infinities, 0)

    # Income per child uses (1 + children) so childless applicants divide by 1.
    X['new_inc_per_child'] = (
        X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])).replace(infinities, 0)

    return X
          
def build_features(datasets):
    '''
    Run every feature-engineering step on the application table and return
    the enriched frame. A progress message is printed before each step
    except the final ratio-feature step.
    '''
    # Steps that join information from a supplementary table.
    table_steps = (
        ("features from previous application", features_from_previous_application),
        ("features from bureau", features_from_bureau),
        ("features from credit card balance", features_from_credit_card_balance),
        ("features from installments", features_from_installments_payments),
    )

    X = datasets['application']
    for message, step in table_steps:
        print(message)
        X = step(X, datasets)

    print("features from application")
    X = features_from_application(X)

    # new features that are the ratio of two numerical features
    return build_ratio_features(X)

          
class DataFrameSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer that picks `attribute_names` out of the incoming frame and
    hands the selection on as its `.values` array.
    '''
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values
    
def print_msg(X, **kwargs):
    '''
    Print the value of every keyword argument (one per line) and return X
    unchanged, so it can sit in a pipeline as a progress marker.
    '''
    for text in kwargs.values():
        print(text)
    return X

def get_standard_pipeline(cat_attribs, num_attribs, poly_degree=1):
    '''
    Build the (unfitted) preprocessing ColumnTransformer.

    Numeric branch: select -> mean-impute -> standard-scale -> polynomial
    features of degree `poly_degree`.
    Categorical branch: select -> impute the constant 'N/A' -> one-hot encode
    (dense uint8 output, unknown categories ignored).
    A message-printing step follows each stage.
    '''
    def announce(label):
        # Pass-through step that prints `label` when the pipeline reaches it.
        return FunctionTransformer(print_msg, kw_args=dict(msg=label), validate=False)

    num_steps = [
        ('num_selector', DataFrameSelector(num_attribs)),
        ('print_num_1', announce("num_selector")),
        ('num_imputer', SimpleImputer(strategy='mean')),
        ('print_num_2', announce("num_imputer")),
        ('std_scaler', StandardScaler()),
        ('print_num_3', announce("num_scaler")),
        ('polynomial', PolynomialFeatures(degree=poly_degree)),
        ('print_num_done', announce("num_done")),
    ]
    cat_steps = [
        ('cat_selector', DataFrameSelector(cat_attribs)),
        ('print_cat_1', announce("cat_selector")),
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
        ('print_cat_2', announce("cat_imputer")),
        ('ohe', OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown="ignore")),
        ('print_cat_done', announce("cat_done")),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(num_steps), num_attribs),
            ('cat', Pipeline(cat_steps), cat_attribs),
        ],
        n_jobs=-1,
    )

# default_pipeline = get_standard_pipeline(cat_attribs, num_attribs)

def pre_process(preproc_pipeline, phase = "train"):
    '''
    Performs all feature engineering, data munging, etc. in a standardized way.

    For phase == "train" the pipeline is fitted on the engineered features;
    for any other phase the pipeline is only applied.
    Returns: (transformed) X, Y, pipeline. Y is the TARGET column when
    phase == "train" and None otherwise; the pipeline object passed in is
    always returned.
    '''
    datasets = get_datasets(phase)
    is_training = phase == "train"

    Y = datasets['application']['TARGET'] if is_training else None

    X = build_features(datasets)

    print("start pipeline")
    apply_pipeline = (preproc_pipeline.fit_transform if is_training
                      else preproc_pipeline.transform)
    X = apply_pipeline(X)

    return X, Y, preproc_pipeline

def get_search_class(search_repr):
    '''
    Return the text before the first '(' in `search_repr`, i.e. the class
    name at the start of an estimator repr. When there is no '(', the whole
    string is returned.
    '''
    # partition() returns the full string as the head when '(' is absent,
    # whereas slicing with find() == -1 would drop the last character.
    return search_repr.partition('(')[0]

def get_clf(search_repr):
    '''
    Extract the estimator's repr from a search object's repr: the text after
    'estimator=' up to and including the first ')' that is followed by ','.
    Returns '' when 'estimator=' or the closing '),' cannot be found.
    '''
    e_string = 'estimator='
    marker_at = search_repr.find(e_string)
    if marker_at < 0:
        # Without this guard the -1 from find() would produce an arbitrary slice.
        return ''
    start = marker_at + len(e_string)
    end = search_repr.find('),', start)
    if end < 0:
        return ''
    return search_repr[start:end + 1]
    
def get_params(search_repr, is_grid = True):
    '''
    Extract the parameter-space dict from a search object's repr.

    Looks for 'param_grid=' when `is_grid` is true, otherwise for
    'param_distributions=', and returns the text from there up to and
    including the first '}' that is followed by ','.
    Returns '' when the marker or the closing '},' cannot be found.
    '''
    arg_string = 'param_grid=' if is_grid else 'param_distributions='
    marker_at = search_repr.find(arg_string)
    if marker_at < 0:
        # Without this guard the -1 from find() would produce an arbitrary slice.
        return ''
    start = marker_at + len(arg_string)
    end = search_repr.find('},', start)
    if end < 0:
        return ''
    return search_repr[start:end + 1]

def run_test(X, Y, 
             search, 
             test_description, 
             experiment_name, 
             pipeline_named_steps,
             cat_attribs,
             num_attribs,
             testSize = 0.1, 
             **fit_params):
    '''
    Fits a hyper-parameter search on a train split, sanity-checks the best
    model on a held-out split and logs the run.
    NOTE: set early stopping on the estimator you pass in!
    Appends one line per run to `runs.csv` in LOG_DIR and stores the best
    model next to it as `<run_id>.joblib`.
    
    Arguments:
    `X` - training data - will be split into train and test sets
    `Y` - targets
    `search`- an instance of GridSearchCV or RandomizedSearchCV
    `test_description` - description of test run to be logged
    `experiment_name` - name of experiment that this test run is a part of
    `pipeline_named_steps` - to be logged alongside metrics for future reference
    `cat_attribs` - to be logged alongside metrics for future reference
    `num_attribs` - to be logged alongside metrics for future reference
    `testSize` - fraction of training set to hold out for sanity check/test 
    `fit_params` - additional parameters to be passed to the search .fit() function
    '''
    # train/test split 
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=testSize, random_state=42)
    
    # start log
    search_repr = repr(search)
    search_class = get_search_class(search_repr)
    # A randomized search exposes `param_distributions` instead of
    # `param_grid`; pick the matching marker so its parameters are logged too.
    is_grid = not hasattr(search, 'param_distributions')
    params = get_params(search_repr, is_grid=is_grid)

    run_attribs = [
        wrap_in_quotes(experiment_name),
        wrap_in_quotes(test_description),
        wrap_in_quotes(str(cat_attribs)),
        wrap_in_quotes(str(num_attribs)),
        wrap_in_quotes(pipeline_named_steps),
        wrap_in_quotes(search_class),
        wrap_in_quotes(params),
    ]

    # Make sure the log directory exists before the (potentially long) search
    # starts, so results are not lost to a missing folder afterwards.
    os.makedirs(LOG_DIR, exist_ok=True)

    search.fit(X_train, y_train, **fit_params)
    cv_results = search.cv_results_

    # TODO in phase 3: calculate p-value w.r.t. baseline 

    # sanity check with test set
    verification = VerificationTest(search.best_estimator_)
    verification.run(X_test, y_test)

    # log: best params, mean fit time, best CV score, held-out AUC and
    # accuracy, prediction time per row, run id
    run_attribs.append(wrap_in_quotes(str(search.best_params_)))
    run_attribs.append(format_num(mean(cv_results['mean_fit_time'])))
    run_attribs.append(format_num(search.best_score_))
    run_attribs.append(format_num(verification.test_auc))
    run_attribs.append(format_num(verification.test_accuracy))
    run_attribs.append(format_num(verification.prediction_ms_per_row))
    run_id = str(uuid.uuid4())
    run_attribs.append(wrap_in_quotes(run_id))
    line = ','.join(run_attribs) + '\n'
    
    with open(os.path.join(LOG_DIR, "runs.csv"), 'a') as run_file:
        run_file.write(line)    
        
    # store best model
    model_file = run_id + '.joblib'
    joblib.dump(search.best_estimator_, os.path.join(LOG_DIR, model_file))

class VerificationTest:
    '''
    Runs a verification test against held out test set. Calculates the following metrics,
    which are available as attributes after `run()`:
    * test_auc
    * test_accuracy
    * prediction_ms_per_row
    '''
    def __init__(self, model):
        '''
        `model` = best_estimator_ from search; must provide `predict` and
        `predict_proba`
        '''
        self._model = model
        
    def run(self, X_test, y_test):
        '''
        `X_test` - held-out features
        `y_test` - held-out targets
        '''
        y_hat = self._model.predict(X_test)
        self.test_accuracy = accuracy_score(y_test, y_hat)
        
        # AUC is computed from the second predict_proba column.
        y_proba = self._model.predict_proba(X_test)
        self.test_auc = roc_auc_score(y_test, y_proba[:,1])
        
        # Time one extra predict() call. timeit switches garbage collection
        # off while timing; gc.enable as the setup turns it back on.
        elapsed_seconds = timeit.timeit(lambda: self._model.predict(X_test),
                                        setup = gc.enable, number = 1)
        # seconds -> milliseconds, as the attribute name says (the factor
        # used to be 1e6, which yields microseconds).
        self.prediction_ms_per_row = elapsed_seconds * 1000 / len(y_test)
                
def wrapper(func, *args, **kwargs):
    '''
    Bind `args` and `kwargs` to `func` and return a zero-argument callable
    that performs the call and returns its result.
    '''
    return lambda: func(*args, **kwargs)
        
def wrap_in_quotes(s):
    '''
    Strip double quotes, line feeds and carriage returns from `s`, then
    surround the result with double quotes.
    '''
    unwanted = {'"', '\n', '\r'}
    cleaned = ''.join(ch for ch in s if ch not in unwanted)
    return f'"{cleaned}"'

def format_num(n):
    '''
    Round `n` to 5 decimal places and return it as a string.
    '''
    rounded = round(n, 5)
    return str(rounded)

In [3]:
# Select the numerical and categorical features fed to the preprocessing
# pipeline.

# Numeric columns: raw application columns followed by the engineered columns
# from the features_from_* functions (no_prev_appl ... annuity_income_ratio).
# The columns added by build_ratio_features (income_credit_percen, etc.) are
# not listed here.
NUMERIC_COLS_logit = ['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'CNT_FAM_MEMBERS',
 'HOUR_APPR_PROCESS_START',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'no_prev_appl',
 'no_approved_prev_appl',
 'total_creditLimit',
 'no_of_loans',
 'ave_creditLimit',
 'total_creditLimit_CC',
 'utilization_CC',
 'payment_ratio_CC',
 'past_due_times',
 'credit_income_ratio',
 'annuity_income_ratio']

# Columns routed to the categorical branch of the pipeline (constant-imputed
# and one-hot encoded). Besides text-valued columns this includes the FLAG_*,
# REG_*/LIVE_* and REGION_RATING_* columns.
CATEGORY_COLS_logit = ['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'OCCUPATION_TYPE',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21']

# Neural Network with Keras

In [4]:
from keras import losses, metrics, optimizers
from keras.callbacks import EarlyStopping, ProgbarLogger
from keras.models import Sequential
from keras.layers import Activation, AlphaDropout, Dense, Dropout, Input 
from keras.wrappers.scikit_learn import KerasClassifier

def get_keras_model(architecture = [298, 128, 128], 
                    activation_fn = 'relu', 
                    drop_fraction = None):
    '''
    Build and compile a fully connected binary classifier.

    `architecture` - [input width, hidden width 1, hidden width 2, ...];
                     the default list is only read, never modified
    `activation_fn` - activation used by every hidden layer
    `drop_fraction` - when truthy, a Dropout layer with this rate follows
                      every hidden layer

    The output layer is a single sigmoid unit; the model is compiled with
    Adadelta, binary cross-entropy loss and the binary accuracy metric.
    '''
    net = Sequential()

    def add_hidden(width, **dense_kwargs):
        # One hidden layer, optionally followed by dropout.
        net.add(Dense(width, activation = activation_fn, **dense_kwargs))
        if drop_fraction:
            net.add(Dropout(drop_fraction))

    # The first hidden layer also declares the input width.
    add_hidden(architecture[1], input_shape = (architecture[0],))
    for width in architecture[2:]:
        add_hidden(width)

    # output
    net.add(Dense(1, activation = 'sigmoid'))

    net.compile(optimizer = optimizers.Adadelta(),
                loss = losses.binary_crossentropy,
                metrics = [metrics.binary_accuracy])
    return net

def get_selu_activation_model(architecture = [298, 256, 256], 
                    drop_fraction = None):
    '''
    Build and compile a fully connected binary classifier with SELU hidden
    layers.

    Cannot use get_keras_model because we need different algorithms for
    weight initialization ('lecun_normal') and dropout (AlphaDropout).

    `architecture` - [input width, hidden width 1, hidden width 2, ...];
                     the default list is only read, never modified
    `drop_fraction` - when truthy, an AlphaDropout layer with this rate
                      follows every hidden layer
    '''
    hidden_kwargs = dict(activation = 'selu', kernel_initializer = 'lecun_normal')
    net = Sequential()

    def add_hidden(width, **extra_kwargs):
        # One SELU hidden layer, optionally followed by alpha dropout.
        net.add(Dense(width, **hidden_kwargs, **extra_kwargs))
        if drop_fraction:
            net.add(AlphaDropout(drop_fraction))

    # The first hidden layer also declares the input width.
    add_hidden(architecture[1], input_shape = (architecture[0],))
    for width in architecture[2:]:
        add_hidden(width)

    # output
    net.add(Dense(1, activation = 'sigmoid'))

    net.compile(optimizer = optimizers.Adadelta(),
                loss = losses.binary_crossentropy,
                metrics = [metrics.binary_accuracy])
    return net
    
def get_pipe_named_steps(pipeline):
    '''
    Describe a fitted ColumnTransformer as one string: the `named_steps` of
    every entry in `transformers_` except the last, followed by the str() of
    the last entry itself.
    '''
    *sub_pipelines, final_entry = pipeline.transformers_[-1:] and pipeline.transformers_ or [pipeline.transformers_[-1]]
    described = [str(entry[1].named_steps) for entry in sub_pipelines]
    described.append(str(final_entry))
    return ''.join(described)


Using TensorFlow backend.


## Neural Net Baseline

In [37]:
# Baseline network: get_keras_model with its default architecture and large
# mini-batches, comparing no dropout against a dropout rate of 0.25.
early_stopping = EarlyStopping(patience = 1, restore_best_weights = True)
clf = KerasClassifier(build_fn = get_keras_model, 
                      batch_size = 10240, 
                      validation_split = 0.1, 
                      epochs = 20, 
                      verbose = 2)
search_grid = {'drop_fraction':[None, 0.25]}
# The early-stopping callback is no longer given to the GridSearchCV
# constructor: its `fit_params` argument was deprecated and later removed
# from scikit-learn. It is handed to run_test instead, which forwards extra
# keyword arguments to search.fit().
gs = GridSearchCV(
    estimator = clf,
    param_grid = search_grid,
    scoring = 'roc_auc',  
    cv = 3,
    verbose=10
)

# run test
run_test(X, Y, 
         search = gs, 
         experiment_name = "NN", 
         test_description = 'initial NN with 128-128 and dropout of None or 0.25',
         pipeline_named_steps = get_pipe_named_steps(lgbm_pipeline), 
         num_attribs = NUMERIC_COLS_lgbm, 
         cat_attribs = CATEGORY_COLS_lgbm,
         callbacks = [early_stopping])


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] drop_fraction=None ..............................................
Instructions for updating:
Use tf.cast instead.
Train on 166055 samples, validate on 18451 samples
Epoch 1/20
 - 1s - loss: 0.3585 - binary_accuracy: 0.8560 - val_loss: 0.2606 - val_binary_accuracy: 0.9219
Epoch 2/20
 - 1s - loss: 0.2640 - binary_accuracy: 0.9187 - val_loss: 0.2506 - val_binary_accuracy: 0.9219
Epoch 3/20
 - 1s - loss: 0.2574 - binary_accuracy: 0.9187 - val_loss: 0.2466 - val_binary_accuracy: 0.9219
Epoch 4/20
 - 1s - loss: 0.2544 - binary_accuracy: 0.9187 - val_loss: 0.2463 - val_binary_accuracy: 0.9216
Epoch 5/20
 - 1s - loss: 0.2529 - binary_accuracy: 0.9187 - val_loss: 0.2436 - val_binary_accuracy: 0.9216
Epoch 6/20
 - 1s - loss: 0.2520 - binary_accuracy: 0.9187 - val_loss: 0.2429 - val_binary_accuracy: 0.9215
Epoch 7/20
 - 1s - loss: 0.2513 - binary_accuracy: 0.9188 - val_loss: 0.2430 - val_binary_accuracy: 0.9216
[CV] ..... drop_fracti

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/20
 - 1s - loss: 0.3211 - binary_accuracy: 0.8931 - val_loss: 0.2581 - val_binary_accuracy: 0.9219
Epoch 2/20
 - 1s - loss: 0.2594 - binary_accuracy: 0.9196 - val_loss: 0.2481 - val_binary_accuracy: 0.9219
Epoch 3/20
 - 1s - loss: 0.2531 - binary_accuracy: 0.9196 - val_loss: 0.2444 - val_binary_accuracy: 0.9219
Epoch 4/20
 - 1s - loss: 0.2507 - binary_accuracy: 0.9196 - val_loss: 0.2431 - val_binary_accuracy: 0.9218
Epoch 5/20
 - 1s - loss: 0.2496 - binary_accuracy: 0.9197 - val_loss: 0.2419 - val_binary_accuracy: 0.9218
Epoch 6/20
 - 1s - loss: 0.2490 - binary_accuracy: 0.9197 - val_loss: 0.2415 - val_binary_accuracy: 0.9218
Epoch 7/20
 - 1s - loss: 0.2487 - binary_accuracy: 0.9197 - val_loss: 0.2413 - val_binary_accuracy: 0.9217
Epoch 8/20
 - 1s - loss: 0.2479 - binary_accuracy: 0.9197 - val_loss: 0.2430 - val_binary_accuracy: 0.9219


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   16.1s remaining:    0.0s


[CV] ...... drop_fraction=None, score=0.745481301743778, total=   7.9s
[CV] drop_fraction=None ..............................................
Train on 166055 samples, validate on 18451 samples
Epoch 1/20
 - 1s - loss: 0.3727 - binary_accuracy: 0.8523 - val_loss: 0.2661 - val_binary_accuracy: 0.9196
Epoch 2/20
 - 1s - loss: 0.2617 - binary_accuracy: 0.9193 - val_loss: 0.2561 - val_binary_accuracy: 0.9196
Epoch 3/20
 - 1s - loss: 0.2550 - binary_accuracy: 0.9193 - val_loss: 0.2528 - val_binary_accuracy: 0.9196
Epoch 4/20
 - 1s - loss: 0.2525 - binary_accuracy: 0.9193 - val_loss: 0.2514 - val_binary_accuracy: 0.9196
Epoch 5/20
 - 1s - loss: 0.2511 - binary_accuracy: 0.9194 - val_loss: 0.2511 - val_binary_accuracy: 0.9198
Epoch 6/20
 - 1s - loss: 0.2504 - binary_accuracy: 0.9194 - val_loss: 0.2504 - val_binary_accuracy: 0.9196
Epoch 7/20
 - 1s - loss: 0.2496 - binary_accuracy: 0.9194 - val_loss: 0.2497 - val_binary_accuracy: 0.9197
Epoch 8/20
 - 1s - loss: 0.2488 - binary_accuracy: 0.9194 

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   24.8s remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/20
 - 2s - loss: 0.3339 - binary_accuracy: 0.8896 - val_loss: 0.2593 - val_binary_accuracy: 0.9219
Epoch 2/20
 - 1s - loss: 0.2729 - binary_accuracy: 0.9186 - val_loss: 0.2508 - val_binary_accuracy: 0.9219
Epoch 3/20
 - 1s - loss: 0.2670 - binary_accuracy: 0.9185 - val_loss: 0.2476 - val_binary_accuracy: 0.9219
Epoch 4/20
 - 1s - loss: 0.2642 - binary_accuracy: 0.9185 - val_loss: 0.2457 - val_binary_accuracy: 0.9218
Epoch 5/20
 - 1s - loss: 0.2619 - binary_accuracy: 0.9186 - val_loss: 0.2446 - val_binary_accuracy: 0.9218
Epoch 6/20
 - 1s - loss: 0.2607 - binary_accuracy: 0.9186 - val_loss: 0.2437 - val_binary_accuracy: 0.9218
Epoch 7/20
 - 1s - loss: 0.2594 - binary_accuracy: 0.9187 - val_loss: 0.2429 - val_binary_accuracy: 0.9218
Epoch 8/20
 - 1s - loss: 0.2576 - binary_accuracy: 0.9187 - val_loss: 0.2426 - val_binary_accuracy: 0.9218
Epoch 9/20
 - 1s - loss: 0.2567 - binary_accuracy: 0.9187 - val_loss: 0.2420 - val_binary_acc

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   38.3s remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/20
 - 2s - loss: 0.3221 - binary_accuracy: 0.9056 - val_loss: 0.2606 - val_binary_accuracy: 0.9219
Epoch 2/20
 - 1s - loss: 0.2727 - binary_accuracy: 0.9196 - val_loss: 0.2496 - val_binary_accuracy: 0.9219
Epoch 3/20
 - 1s - loss: 0.2651 - binary_accuracy: 0.9194 - val_loss: 0.2455 - val_binary_accuracy: 0.9219
Epoch 4/20
 - 1s - loss: 0.2616 - binary_accuracy: 0.9195 - val_loss: 0.2441 - val_binary_accuracy: 0.9219
Epoch 5/20
 - 1s - loss: 0.2595 - binary_accuracy: 0.9195 - val_loss: 0.2452 - val_binary_accuracy: 0.9219
[CV] ..... drop_fraction=0.25, score=0.7356270097035199, total=   7.2s
[CV] drop_fraction=0.25 ..............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   46.3s remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/20
 - 2s - loss: 0.3583 - binary_accuracy: 0.8693 - val_loss: 0.2649 - val_binary_accuracy: 0.9196
Epoch 2/20
 - 1s - loss: 0.2741 - binary_accuracy: 0.9191 - val_loss: 0.2566 - val_binary_accuracy: 0.9196
Epoch 3/20
 - 1s - loss: 0.2667 - binary_accuracy: 0.9191 - val_loss: 0.2533 - val_binary_accuracy: 0.9196
Epoch 4/20
 - 1s - loss: 0.2633 - binary_accuracy: 0.9190 - val_loss: 0.2525 - val_binary_accuracy: 0.9196
Epoch 5/20
 - 1s - loss: 0.2608 - binary_accuracy: 0.9190 - val_loss: 0.2510 - val_binary_accuracy: 0.9197
Epoch 6/20
 - 1s - loss: 0.2591 - binary_accuracy: 0.9191 - val_loss: 0.2502 - val_binary_accuracy: 0.9196
Epoch 7/20
 - 1s - loss: 0.2580 - binary_accuracy: 0.9193 - val_loss: 0.2501 - val_binary_accuracy: 0.9196
Epoch 8/20
 - 1s - loss: 0.2579 - binary_accuracy: 0.9191 - val_loss: 0.2503 - val_binary_accuracy: 0.9196
[CV] ..... drop_fraction=0.25, score=0.7440576105714417, total=  11.0s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   58.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   58.0s finished


Train on 249083 samples, validate on 27676 samples
Epoch 1/20
 - 2s - loss: 0.3317 - binary_accuracy: 0.8747 - val_loss: 0.2520 - val_binary_accuracy: 0.9224
Epoch 2/20
 - 1s - loss: 0.2571 - binary_accuracy: 0.9191 - val_loss: 0.2459 - val_binary_accuracy: 0.9224
Epoch 3/20
 - 1s - loss: 0.2530 - binary_accuracy: 0.9191 - val_loss: 0.2443 - val_binary_accuracy: 0.9222
Epoch 4/20
 - 1s - loss: 0.2513 - binary_accuracy: 0.9191 - val_loss: 0.2433 - val_binary_accuracy: 0.9222
Epoch 5/20
 - 1s - loss: 0.2505 - binary_accuracy: 0.9190 - val_loss: 0.2423 - val_binary_accuracy: 0.9220
Epoch 6/20
 - 1s - loss: 0.2498 - binary_accuracy: 0.9191 - val_loss: 0.2466 - val_binary_accuracy: 0.9214


## Wide Neural Nets

In [5]:
# Build and fit the preprocessing pipeline on the training data, then set up
# a grid search over wider architectures, dropout and small mini-batches.
# NOTE(review): the output recorded below this cell shows
# "NameError: name 'EarlyStopping' is not defined" - the Keras import cell
# apparently has to be run before this one.
pipeline = get_standard_pipeline(cat_attribs=CATEGORY_COLS_logit, num_attribs=NUMERIC_COLS_logit)
X, Y, pipe = pre_process(phase = "train", preproc_pipeline = pipeline)

early_stopping = EarlyStopping(patience = 1, restore_best_weights = True)
clf = KerasClassifier(build_fn = get_keras_model, 
                      validation_split = 0.1, 
                      epochs = 25, 
                      verbose = 2)
# Every architecture starts with 298, the network's input width.
# NOTE(review): presumably this must equal the number of columns the
# preprocessing pipeline produces - confirm against X.shape[1].
search_grid = {'drop_fraction':[None, 0.25], 
               'architecture':[[298,1024,1024],[298,512,512],[298,256,256]],
               'batch_size':[8,32]}
# NOTE(review): `fit_params` as a GridSearchCV constructor argument is
# deprecated in scikit-learn and, to my knowledge, removed from 0.21 on;
# run_test forwards extra keyword arguments to search.fit(), which looks like
# the supported place for the callbacks - confirm before upgrading.
gs = GridSearchCV(
    estimator = clf,
    param_grid = search_grid,
    scoring = 'roc_auc',  
    cv = 3,
    verbose=10,
    fit_params = {'callbacks': [early_stopping]}
)



loaded 307511 records
features from previous application
features from bureau
features from credit card balance
features from installments
features from application
start pipeline


NameError: name 'EarlyStopping' is not defined

In [5]:
# run test: fit the grid search `gs` on X, Y and log the run under the "NN"
# experiment, recording the fitted pipeline's steps and the column lists used.
run_test(X, Y, 
         search = gs, 
         experiment_name = "NN", 
         test_description = 'larger architectures, relu activation, smaller mini-batches',
         pipeline_named_steps = get_pipe_named_steps(pipeline), 
         num_attribs = NUMERIC_COLS_logit, 
         cat_attribs = CATEGORY_COLS_logit)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] architecture=[298, 1024, 1024], batch_size=8, drop_fraction=None 
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 84s - loss: 0.2660 - binary_accuracy: 0.9186 - val_loss: 0.2472 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 81s - loss: 0.2657 - binary_accuracy: 0.9187 - val_loss: 0.2534 - val_binary_accuracy: 0.9219
[CV]  architecture=[298, 1024, 1024], batch_size=8, drop_fraction=None, score=0.7242726725904918, total= 2.9min
[CV] architecture=[298, 1024, 1024], batch_size=8, drop_fraction=None 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.0min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 82s - loss: 0.2639 - binary_accuracy: 0.9196 - val_loss: 0.2544 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 81s - loss: 0.2632 - binary_accuracy: 0.9196 - val_loss: 0.2507 - val_binary_accuracy: 0.9219
Epoch 3/25
 - 81s - loss: 0.2632 - binary_accuracy: 0.9196 - val_loss: 0.2496 - val_binary_accuracy: 0.9219
Epoch 4/25
 - 82s - loss: 0.2629 - binary_accuracy: 0.9196 - val_loss: 0.2562 - val_binary_accuracy: 0.9219
[CV]  architecture=[298, 1024, 1024], batch_size=8, drop_fraction=None, score=0.7333799771880454, total= 5.5min
[CV] architecture=[298, 1024, 1024], batch_size=8, drop_fraction=None 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.8min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 83s - loss: 0.2652 - binary_accuracy: 0.9192 - val_loss: 0.2561 - val_binary_accuracy: 0.9196
Epoch 2/25
 - 82s - loss: 0.2637 - binary_accuracy: 0.9193 - val_loss: 0.2591 - val_binary_accuracy: 0.9196
[CV]  architecture=[298, 1024, 1024], batch_size=8, drop_fraction=None, score=0.7274613902873297, total= 2.9min
[CV] architecture=[298, 1024, 1024], batch_size=8, drop_fraction=0.25 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 11.9min remaining:    0.0s


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 88s - loss: 0.2777 - binary_accuracy: 0.9185 - val_loss: 0.2645 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 87s - loss: 0.2814 - binary_accuracy: 0.9187 - val_loss: 0.2883 - val_binary_accuracy: 0.9219
[CV]  architecture=[298, 1024, 1024], batch_size=8, drop_fraction=0.25, score=0.7124201199125946, total= 3.1min
[CV] architecture=[298, 1024, 1024], batch_size=8, drop_fraction=0.25 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 15.2min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 90s - loss: 0.2749 - binary_accuracy: 0.9196 - val_loss: 0.2598 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 89s - loss: 0.2787 - binary_accuracy: 0.9196 - val_loss: 0.2663 - val_binary_accuracy: 0.9218
[CV]  architecture=[298, 1024, 1024], batch_size=8, drop_fraction=0.25, score=0.7155013280631977, total= 3.1min
[CV] architecture=[298, 1024, 1024], batch_size=8, drop_fraction=0.25 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 18.5min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 90s - loss: 0.2769 - binary_accuracy: 0.9192 - val_loss: 0.2696 - val_binary_accuracy: 0.9196
Epoch 2/25
 - 89s - loss: 0.2834 - binary_accuracy: 0.9192 - val_loss: 0.2757 - val_binary_accuracy: 0.9197
[CV]  architecture=[298, 1024, 1024], batch_size=8, drop_fraction=0.25, score=0.719243629367416, total= 3.1min
[CV] architecture=[298, 1024, 1024], batch_size=32, drop_fraction=None 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 21.9min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 22s - loss: 0.2585 - binary_accuracy: 0.9187 - val_loss: 0.2480 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 22s - loss: 0.2547 - binary_accuracy: 0.9187 - val_loss: 0.2479 - val_binary_accuracy: 0.9219
Epoch 3/25
 - 21s - loss: 0.2535 - binary_accuracy: 0.9188 - val_loss: 0.2417 - val_binary_accuracy: 0.9218
Epoch 4/25
 - 22s - loss: 0.2531 - binary_accuracy: 0.9188 - val_loss: 0.2419 - val_binary_accuracy: 0.9205
[CV]  architecture=[298, 1024, 1024], batch_size=32, drop_fraction=None, score=0.7446104770460023, total= 1.5min
[CV] architecture=[298, 1024, 1024], batch_size=32, drop_fraction=None 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 23.5min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 22s - loss: 0.2563 - binary_accuracy: 0.9196 - val_loss: 0.2475 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 21s - loss: 0.2529 - binary_accuracy: 0.9197 - val_loss: 0.2543 - val_binary_accuracy: 0.9220
[CV]  architecture=[298, 1024, 1024], batch_size=32, drop_fraction=None, score=0.740074883906393, total=  46.0s
[CV] architecture=[298, 1024, 1024], batch_size=32, drop_fraction=None 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 24.3min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 22s - loss: 0.2568 - binary_accuracy: 0.9192 - val_loss: 0.2525 - val_binary_accuracy: 0.9196
Epoch 2/25
 - 22s - loss: 0.2533 - binary_accuracy: 0.9193 - val_loss: 0.2522 - val_binary_accuracy: 0.9196
Epoch 3/25
 - 22s - loss: 0.2526 - binary_accuracy: 0.9193 - val_loss: 0.2541 - val_binary_accuracy: 0.9198
[CV]  architecture=[298, 1024, 1024], batch_size=32, drop_fraction=None, score=0.7419713648699664, total= 1.1min
[CV] architecture=[298, 1024, 1024], batch_size=32, drop_fraction=0.25 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 25.5min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 24s - loss: 0.2625 - binary_accuracy: 0.9185 - val_loss: 0.2423 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 23s - loss: 0.2590 - binary_accuracy: 0.9187 - val_loss: 0.2466 - val_binary_accuracy: 0.9219
[CV]  architecture=[298, 1024, 1024], batch_size=32, drop_fraction=0.25, score=0.7392518829164154, total=  49.9s
[CV] architecture=[298, 1024, 1024], batch_size=32, drop_fraction=0.25 
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 24s - loss: 0.2596 - binary_accuracy: 0.9195 - val_loss: 0.2449 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 23s - loss: 0.2570 - binary_accuracy: 0.9196 - val_loss: 0.2432 - val_binary_accuracy: 0.9219
Epoch 3/25
 - 23s - loss: 0.2563 - binary_accuracy: 0.9196 - val_loss: 0.2504 - val_binary_accuracy: 0.9219
[CV]  architecture=[298, 1024, 1024], batch_size=32, drop_fraction=0.25, score=0.7406565029458219, total= 1.2min
[CV] architecture=[298, 1024, 1024], batch_size=32, drop_fra

Epoch 2/25
 - 62s - loss: 0.2631 - binary_accuracy: 0.9196 - val_loss: 0.2588 - val_binary_accuracy: 0.9219
[CV]  architecture=[298, 256, 256], batch_size=8, drop_fraction=None, score=0.7222821011394849, total= 2.3min
[CV] architecture=[298, 256, 256], batch_size=8, drop_fraction=None ..
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 64s - loss: 0.2645 - binary_accuracy: 0.9193 - val_loss: 0.2622 - val_binary_accuracy: 0.9196
Epoch 2/25
 - 62s - loss: 0.2637 - binary_accuracy: 0.9192 - val_loss: 0.2623 - val_binary_accuracy: 0.9196
[CV]  architecture=[298, 256, 256], batch_size=8, drop_fraction=None, score=0.7255510891894679, total= 2.3min
[CV] architecture=[298, 256, 256], batch_size=8, drop_fraction=0.25 ..
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 68s - loss: 0.2733 - binary_accuracy: 0.9187 - val_loss: 0.2502 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 66s - loss: 0.2744 - binary_accuracy: 0.9187 - val_loss: 0.2599 - val_binary_accuracy: 0.9

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 75.9min finished


Train on 249083 samples, validate on 27676 samples
Epoch 1/25
 - 27s - loss: 0.2548 - binary_accuracy: 0.9191 - val_loss: 0.2406 - val_binary_accuracy: 0.9225
Epoch 2/25
 - 26s - loss: 0.2522 - binary_accuracy: 0.9191 - val_loss: 0.2430 - val_binary_accuracy: 0.9223


## SELU activation 

Klambauer, et al. proposed ["self-normalizing neural networks"](https://arxiv.org/abs/1706.02515) that use scaled exponential linear unit (SELU) activation. This activation drives the neuron activations toward zero mean and unit variance as they propagate through the layers, which allows training to explore the parameter space without encountering exploding or vanishing gradients. Consequently, feed-forward networks can explore a larger parameter space for more epochs--or so the hypothesis goes. Let us see for ourselves!

In [10]:
# Preprocessing is left commented out in this cell; X, Y and pipeline are
# taken from the running session.
# NOTE(review): presumably they come from an earlier cell - confirm before
# running this cell on its own.
#pipeline = get_standard_pipeline(cat_attribs=CATEGORY_COLS_logit, num_attribs=NUMERIC_COLS_logit)
#X, Y, pipe = pre_process(phase = "train", preproc_pipeline = pipeline)


# Early stopping with a patience of 2 epochs; the best weights seen during
# the fit are restored when training stops.
patient_early_stopping = EarlyStopping(
    patience=2,
    restore_best_weights=True,
)

# scikit-learn style wrapper around the SELU model builder; 10% of the data
# given to each fit is held out for validation, for at most 25 epochs.
selu_clf = KerasClassifier(
    build_fn=get_selu_activation_model,
    validation_split=0.1,
    epochs=25,
    verbose=2,
)

# 2 drop fractions x 4 architectures x 2 batch sizes = 16 candidates.
selu_search_grid = {
    'drop_fraction': [None, 0.25],
    'architecture': [
        [298, 512, 512],
        [298, 256, 256],
        [298, 512, 512, 256],
        [298, 256, 256, 128],
    ],
    'batch_size': [8, 32],
}

# 3-fold grid search scored by ROC AUC (16 candidates -> 48 fits).
# NOTE(review): passing fit_params to the GridSearchCV constructor was
# deprecated in scikit-learn 0.19 and removed in 0.21 - confirm the pinned
# version before upgrading.
sgs = GridSearchCV(
    estimator=selu_clf,
    param_grid=selu_search_grid,
    scoring='roc_auc',
    cv=3,
    verbose=10,
    fit_params={'callbacks': [patient_early_stopping]},
)

# Hand the SELU grid search (sgs) to run_test, logged under experiment "NN".
run_test(
    X,
    Y,
    search=sgs,
    experiment_name="NN",
    test_description='larger architectures, selu activation, smaller mini-batches',
    pipeline_named_steps=get_pipe_named_steps(pipeline),
    num_attribs=NUMERIC_COLS_logit,
    cat_attribs=CATEGORY_COLS_logit,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] architecture=[298, 512, 512], batch_size=8, drop_fraction=None ..
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 67s - loss: 0.2694 - binary_accuracy: 0.9175 - val_loss: 0.2508 - val_binary_accuracy: 0.9217
Epoch 2/25
 - 64s - loss: 0.2651 - binary_accuracy: 0.9176 - val_loss: 0.2508 - val_binary_accuracy: 0.9218
Epoch 3/25
 - 64s - loss: 0.2657 - binary_accuracy: 0.9176 - val_loss: 0.2451 - val_binary_accuracy: 0.9218
Epoch 4/25
 - 64s - loss: 0.2655 - binary_accuracy: 0.9176 - val_loss: 0.2553 - val_binary_accuracy: 0.9196
Epoch 5/25
 - 64s - loss: 0.2664 - binary_accuracy: 0.9172 - val_loss: 0.2485 - val_binary_accuracy: 0.9218
[CV]  architecture=[298, 512, 512], batch_size=8, drop_fraction=None, score=0.7289892235670072, total= 5.5min
[CV] architecture=[298, 512, 512], batch_size=8

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 64s - loss: 0.2675 - binary_accuracy: 0.9181 - val_loss: 0.2468 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 64s - loss: 0.2635 - binary_accuracy: 0.9188 - val_loss: 0.2452 - val_binary_accuracy: 0.9218
Epoch 3/25
 - 64s - loss: 0.2637 - binary_accuracy: 0.9186 - val_loss: 0.2554 - val_binary_accuracy: 0.9218
Epoch 4/25
 - 64s - loss: 0.2639 - binary_accuracy: 0.9186 - val_loss: 0.2541 - val_binary_accuracy: 0.9215
[CV]  architecture=[298, 512, 512], batch_size=8, drop_fraction=None, score=0.727110365504903, total= 4.4min
[CV] architecture=[298, 512, 512], batch_size=8, drop_fraction=None ..


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 10.4min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 65s - loss: 0.2689 - binary_accuracy: 0.9179 - val_loss: 0.2555 - val_binary_accuracy: 0.9197
Epoch 2/25
 - 64s - loss: 0.2639 - binary_accuracy: 0.9186 - val_loss: 0.2873 - val_binary_accuracy: 0.9195
Epoch 3/25
 - 64s - loss: 0.2638 - binary_accuracy: 0.9183 - val_loss: 0.3141 - val_binary_accuracy: 0.9089
[CV]  architecture=[298, 512, 512], batch_size=8, drop_fraction=None, score=0.7317616970871867, total= 3.4min
[CV] architecture=[298, 512, 512], batch_size=8, drop_fraction=0.25 ..


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 14.0min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 70s - loss: 0.2759 - binary_accuracy: 0.9170 - val_loss: 0.4012 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 69s - loss: 0.2733 - binary_accuracy: 0.9170 - val_loss: 0.3559 - val_binary_accuracy: 0.9218
Epoch 3/25
 - 69s - loss: 0.2748 - binary_accuracy: 0.9170 - val_loss: 0.4833 - val_binary_accuracy: 0.9218
Epoch 4/25
 - 69s - loss: 0.2747 - binary_accuracy: 0.9168 - val_loss: 0.3055 - val_binary_accuracy: 0.9220
Epoch 5/25
 - 69s - loss: 0.2757 - binary_accuracy: 0.9167 - val_loss: 0.3979 - val_binary_accuracy: 0.9219
Epoch 6/25
 - 69s - loss: 0.2761 - binary_accuracy: 0.9169 - val_loss: 0.3939 - val_binary_accuracy: 0.9219
[CV]  architecture=[298, 512, 512], batch_size=8, drop_fraction=0.25, score=0.7182504395798047, total= 7.1min
[CV] architecture=[298, 512, 512], batch_size=8, drop_fraction=0.25 ..


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 21.3min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 70s - loss: 0.2736 - binary_accuracy: 0.9175 - val_loss: 0.3113 - val_binary_accuracy: 0.9200
Epoch 2/25
 - 69s - loss: 0.2716 - binary_accuracy: 0.9179 - val_loss: 0.4005 - val_binary_accuracy: 0.9219
Epoch 3/25
 - 69s - loss: 0.2719 - binary_accuracy: 0.9179 - val_loss: 0.5130 - val_binary_accuracy: 0.9218
[CV]  architecture=[298, 512, 512], batch_size=8, drop_fraction=0.25, score=0.7165752765849227, total= 3.6min
[CV] architecture=[298, 512, 512], batch_size=8, drop_fraction=0.25 ..


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 25.2min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 70s - loss: 0.2749 - binary_accuracy: 0.9169 - val_loss: 0.4062 - val_binary_accuracy: 0.9196
Epoch 2/25
 - 70s - loss: 0.2721 - binary_accuracy: 0.9179 - val_loss: 0.3833 - val_binary_accuracy: 0.9195
Epoch 3/25
 - 70s - loss: 0.2728 - binary_accuracy: 0.9174 - val_loss: 0.4369 - val_binary_accuracy: 0.9196
Epoch 4/25
 - 70s - loss: 0.2730 - binary_accuracy: 0.9177 - val_loss: 0.3916 - val_binary_accuracy: 0.9194
[CV]  architecture=[298, 512, 512], batch_size=8, drop_fraction=0.25, score=0.7301354533342069, total= 4.8min
[CV] architecture=[298, 512, 512], batch_size=32, drop_fraction=None .


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 30.3min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 17s - loss: 0.2645 - binary_accuracy: 0.9167 - val_loss: 0.2554 - val_binary_accuracy: 0.9220
Epoch 2/25
 - 17s - loss: 0.2553 - binary_accuracy: 0.9182 - val_loss: 0.2454 - val_binary_accuracy: 0.9217
Epoch 3/25
 - 17s - loss: 0.2545 - binary_accuracy: 0.9185 - val_loss: 0.2425 - val_binary_accuracy: 0.9220
Epoch 4/25
 - 17s - loss: 0.2536 - binary_accuracy: 0.9185 - val_loss: 0.2432 - val_binary_accuracy: 0.9219
Epoch 5/25
 - 17s - loss: 0.2527 - binary_accuracy: 0.9185 - val_loss: 0.2533 - val_binary_accuracy: 0.9220
[CV]  architecture=[298, 512, 512], batch_size=32, drop_fraction=None, score=0.7507895689201971, total= 1.4min
[CV] architecture=[298, 512, 512], batch_size=32, drop_fraction=None .


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 31.8min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 17s - loss: 0.2627 - binary_accuracy: 0.9175 - val_loss: 0.2428 - val_binary_accuracy: 0.9220
Epoch 2/25
 - 17s - loss: 0.2532 - binary_accuracy: 0.9194 - val_loss: 0.2437 - val_binary_accuracy: 0.9219
Epoch 3/25
 - 17s - loss: 0.2519 - binary_accuracy: 0.9193 - val_loss: 0.2490 - val_binary_accuracy: 0.9216
[CV]  architecture=[298, 512, 512], batch_size=32, drop_fraction=None, score=0.7393693745570437, total=  53.5s
[CV] architecture=[298, 512, 512], batch_size=32, drop_fraction=None .


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 32.8min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 17s - loss: 0.2613 - binary_accuracy: 0.9178 - val_loss: 0.2526 - val_binary_accuracy: 0.9199
Epoch 2/25
 - 17s - loss: 0.2535 - binary_accuracy: 0.9190 - val_loss: 0.2595 - val_binary_accuracy: 0.9171
Epoch 3/25
 - 17s - loss: 0.2525 - binary_accuracy: 0.9191 - val_loss: 0.2554 - val_binary_accuracy: 0.9196
[CV]  architecture=[298, 512, 512], batch_size=32, drop_fraction=None, score=0.7489278427858065, total=  54.0s
[CV] architecture=[298, 512, 512], batch_size=32, drop_fraction=0.25 .


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 33.8min remaining:    0.0s


Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 19s - loss: 0.2661 - binary_accuracy: 0.9169 - val_loss: 0.3455 - val_binary_accuracy: 0.9220
Epoch 2/25
 - 18s - loss: 0.2590 - binary_accuracy: 0.9180 - val_loss: 0.3503 - val_binary_accuracy: 0.9219
Epoch 3/25
 - 18s - loss: 0.2587 - binary_accuracy: 0.9182 - val_loss: 0.2988 - val_binary_accuracy: 0.8895
Epoch 4/25
 - 18s - loss: 0.2582 - binary_accuracy: 0.9182 - val_loss: 0.3278 - val_binary_accuracy: 0.9217
Epoch 5/25
 - 18s - loss: 0.2580 - binary_accuracy: 0.9181 - val_loss: 0.3210 - val_binary_accuracy: 0.9213
[CV]  architecture=[298, 512, 512], batch_size=32, drop_fraction=0.25, score=0.7429062374958055, total= 1.6min
[CV] architecture=[298, 512, 512], batch_size=32, drop_fraction=0.25 .
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 19s - loss: 0.2641 - binary_accuracy: 0.9179 - val_loss: 0.3179 - val_binary_accuracy: 0.9219
Epoch 2/25
 - 18s - loss: 0.2569 - binary_accuracy: 0.9190 - val_los

Epoch 5/25
 - 17s - loss: 0.2500 - binary_accuracy: 0.9192 - val_loss: 0.2509 - val_binary_accuracy: 0.9203
Epoch 6/25
 - 17s - loss: 0.2496 - binary_accuracy: 0.9192 - val_loss: 0.2612 - val_binary_accuracy: 0.9199
Epoch 7/25
 - 17s - loss: 0.2491 - binary_accuracy: 0.9193 - val_loss: 0.2499 - val_binary_accuracy: 0.9202
Epoch 8/25
 - 17s - loss: 0.2488 - binary_accuracy: 0.9192 - val_loss: 0.2485 - val_binary_accuracy: 0.9200
Epoch 9/25
 - 17s - loss: 0.2486 - binary_accuracy: 0.9191 - val_loss: 0.2574 - val_binary_accuracy: 0.9197
Epoch 10/25
 - 17s - loss: 0.2482 - binary_accuracy: 0.9194 - val_loss: 0.2544 - val_binary_accuracy: 0.9201
[CV]  architecture=[298, 256, 256], batch_size=32, drop_fraction=None, score=0.7482451574474822, total= 2.9min
[CV] architecture=[298, 256, 256], batch_size=32, drop_fraction=0.25 .
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 19s - loss: 0.2627 - binary_accuracy: 0.9177 - val_loss: 0.3534 - val_binary_accuracy: 0.9202
Epoch 2/25

[CV]  architecture=[298, 512, 512, 256], batch_size=32, drop_fraction=None, score=0.7450741783434739, total= 3.6min
[CV] architecture=[298, 512, 512, 256], batch_size=32, drop_fraction=None 
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 22s - loss: 0.2653 - binary_accuracy: 0.9172 - val_loss: 0.2755 - val_binary_accuracy: 0.9221
Epoch 2/25
 - 21s - loss: 0.2540 - binary_accuracy: 0.9192 - val_loss: 0.2486 - val_binary_accuracy: 0.9218
Epoch 3/25
 - 21s - loss: 0.2530 - binary_accuracy: 0.9194 - val_loss: 0.2580 - val_binary_accuracy: 0.9220
Epoch 4/25
 - 21s - loss: 0.2524 - binary_accuracy: 0.9193 - val_loss: 0.2407 - val_binary_accuracy: 0.9221
Epoch 5/25
 - 21s - loss: 0.2516 - binary_accuracy: 0.9194 - val_loss: 0.2722 - val_binary_accuracy: 0.9220
Epoch 6/25
 - 21s - loss: 0.2510 - binary_accuracy: 0.9194 - val_loss: 0.2439 - val_binary_accuracy: 0.9220
[CV]  architecture=[298, 512, 512, 256], batch_size=32, drop_fraction=None, score=0.7491279370633239, total= 2

 - 21s - loss: 0.2527 - binary_accuracy: 0.9186 - val_loss: 0.2444 - val_binary_accuracy: 0.9216
[CV]  architecture=[298, 256, 256, 128], batch_size=32, drop_fraction=None, score=0.745016733387099, total= 1.2min
[CV] architecture=[298, 256, 256, 128], batch_size=32, drop_fraction=None 
Train on 166055 samples, validate on 18451 samples
Epoch 1/25
 - 23s - loss: 0.2596 - binary_accuracy: 0.9186 - val_loss: 0.2436 - val_binary_accuracy: 0.9220
Epoch 2/25
 - 21s - loss: 0.2521 - binary_accuracy: 0.9197 - val_loss: 0.2450 - val_binary_accuracy: 0.9220
Epoch 3/25
 - 21s - loss: 0.2508 - binary_accuracy: 0.9193 - val_loss: 0.2411 - val_binary_accuracy: 0.9220
Epoch 4/25
 - 21s - loss: 0.2503 - binary_accuracy: 0.9196 - val_loss: 0.2404 - val_binary_accuracy: 0.9218
Epoch 5/25
 - 21s - loss: 0.2498 - binary_accuracy: 0.9196 - val_loss: 0.2437 - val_binary_accuracy: 0.9203
Epoch 6/25
 - 21s - loss: 0.2495 - binary_accuracy: 0.9196 - val_loss: 0.2469 - val_binary_accuracy: 0.9207
[CV]  architec

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 180.5min finished


Train on 249083 samples, validate on 27676 samples
Epoch 1/25
 - 32s - loss: 0.2573 - binary_accuracy: 0.9184 - val_loss: 0.2454 - val_binary_accuracy: 0.9225
Epoch 2/25
 - 29s - loss: 0.2524 - binary_accuracy: 0.9189 - val_loss: 0.2433 - val_binary_accuracy: 0.9225
Epoch 3/25
 - 30s - loss: 0.2516 - binary_accuracy: 0.9190 - val_loss: 0.2405 - val_binary_accuracy: 0.9226
Epoch 4/25
 - 29s - loss: 0.2510 - binary_accuracy: 0.9189 - val_loss: 0.2411 - val_binary_accuracy: 0.9224
Epoch 5/25
 - 30s - loss: 0.2507 - binary_accuracy: 0.9190 - val_loss: 0.2445 - val_binary_accuracy: 0.9209


## Refinement Testing
The best neural network model gets better scores than the other models. However, this success has been achieved at the cost of greater computational resources.

What have we learned so far about neural network architecture and training? 
* Dropout regularization does not help at all.
* SELU activation gets slightly better results than RELU activation, but at the cost of tripling training time and increasing prediction time by 30%.
* Batch size of 32 is better than 8.
* The best architecture has hidden layers of 256-256.
* Test scores are consistently better than training validation scores. This is probably the result of training the model on more observations in the test phase than in training/cross-validation. In other words, neural networks seem to benefit more from increasing observations than other model types.

So let us proceed to a final exploration of the hyper-parameters in the vicinity of our best results so far. The exploration will use RELU activation instead of SELU in order to finish as quickly as possible.

In [5]:
# Build the preprocessing pipeline and run the "train" phase through it.
pipeline = get_standard_pipeline(
    cat_attribs=CATEGORY_COLS_logit,
    num_attribs=NUMERIC_COLS_logit,
)
X, Y, pipe = pre_process(phase="train", preproc_pipeline=pipeline)

# Early stopping with a patience of 2 epochs; the best weights seen during
# the fit are restored when training stops.
early_stopping = EarlyStopping(
    patience=2,
    restore_best_weights=True,
)

# scikit-learn style wrapper around get_keras_model; 10% of the data given
# to each fit is held out for validation, for at most 25 epochs.
clf = KerasClassifier(
    build_fn=get_keras_model,
    validation_split=0.1,
    epochs=25,
    verbose=2,
)

# Candidates around the previous best (256-256): three hidden-layer widths,
# each with and without a trailing 16-unit layer, at two batch sizes
# (6 x 2 = 12 candidates).
search_grid = {
    'architecture': [
        [298, 320, 320],
        [298, 256, 256],
        [298, 192, 192],
        [298, 320, 320, 16],
        [298, 256, 256, 16],
        [298, 192, 192, 16],
    ],
    'batch_size': [32, 48],
}

# 5-fold grid search scored by ROC AUC.
# NOTE(review): passing fit_params to the GridSearchCV constructor was
# deprecated in scikit-learn 0.19 and removed in 0.21 - confirm the pinned
# version before upgrading.
gs = GridSearchCV(
    estimator=clf,
    param_grid=search_grid,
    scoring='roc_auc',
    cv=5,
    verbose=10,
    fit_params={'callbacks': [early_stopping]},
)


loaded 307511 records
features from previous application
features from bureau
features from credit card balance
features from installments
features from application
start pipeline


In [9]:
# Hand the refinement grid search (gs) to run_test, logged under experiment "NN".
run_test(
    X,
    Y,
    search=gs,
    experiment_name="NN",
    test_description='refinement in the vicinity of previous best results',
    pipeline_named_steps=get_pipe_named_steps(pipeline),
    num_attribs=NUMERIC_COLS_logit,
    cat_attribs=CATEGORY_COLS_logit,
)

## Activation Function: RELU vs. SELU
The best feed-forward network architecture seems to be 320-320-16. But with which activation function? We already have the results for RELU, so let's get the results for SELU and compare.

In [4]:
# Build the preprocessing pipeline and run the "train" phase through it.
pipeline = get_standard_pipeline(
    cat_attribs=CATEGORY_COLS_logit,
    num_attribs=NUMERIC_COLS_logit,
)
X, Y, pipe = pre_process(phase="train", preproc_pipeline=pipeline)

# Early stopping with a patience of 2 epochs; the best weights seen during
# the fit are restored when training stops.
patient_early_stopping = EarlyStopping(
    patience=2,
    restore_best_weights=True,
)

# scikit-learn style wrapper around the SELU model builder; 10% of the data
# given to each fit is held out for validation, for at most 25 epochs.
selu_clf = KerasClassifier(
    build_fn=get_selu_activation_model,
    validation_split=0.1,
    epochs=25,
    verbose=2,
)

# Only the 320-320-16 architecture is searched, at two batch sizes
# (2 candidates).
selu_search_grid = {
    'architecture': [[298, 320, 320, 16]],
    'batch_size': [32, 48],
}

# 5-fold grid search scored by ROC AUC (2 candidates -> 10 fits).
# NOTE(review): passing fit_params to the GridSearchCV constructor was
# deprecated in scikit-learn 0.19 and removed in 0.21 - confirm the pinned
# version before upgrading.
sgs = GridSearchCV(
    estimator=selu_clf,
    param_grid=selu_search_grid,
    scoring='roc_auc',
    cv=5,
    verbose=10,
    fit_params={'callbacks': [patient_early_stopping]},
)



loaded 307511 records
features from previous application
features from bureau
features from credit card balance
features from installments
features from application
start pipeline


AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [6]:
# Hand the final SELU grid search (sgs) to run_test, logged under experiment "NN".
run_test(
    X,
    Y,
    search=sgs,
    experiment_name="NN",
    test_description='final test for SELU vs. RELU with best architecture (320-320-16)',
    pipeline_named_steps=get_pipe_named_steps(pipeline),
    num_attribs=NUMERIC_COLS_logit,
    cat_attribs=CATEGORY_COLS_logit,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] architecture=[298, 320, 320, 16], batch_size=32 .................
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 24s - loss: 0.2565 - binary_accuracy: 0.9184 - val_loss: 0.2433 - val_binary_accuracy: 0.9222
Epoch 2/25
 - 22s - loss: 0.2528 - binary_accuracy: 0.9185 - val_loss: 0.2449 - val_binary_accuracy: 0.9221
Epoch 3/25
 - 22s - loss: 0.2515 - binary_accuracy: 0.9188 - val_loss: 0.2446 - val_binary_accuracy: 0.9222
[CV]  architecture=[298, 320, 320, 16], batch_size=32, score=0.7444398841027781, total= 1.2min
[CV] architecture=[298, 320, 320, 16], batch_size=32 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 22s - loss: 0.2569 - binary_accuracy: 0.9191 - val_loss: 0.2457 - val_binary_accuracy: 0.9221
Epoch 2/25
 - 21s - loss: 0.2519 - binary_accuracy: 0.9194 - val_loss: 0.2423 - val_binary_accuracy: 0.9221
Epoch 3/25
 - 22s - loss: 0.2505 - binary_accuracy: 0.9194 - val_loss: 0.2507 - val_binary_accuracy: 0.9221
Epoch 4/25
 - 22s - loss: 0.2499 - binary_accuracy: 0.9194 - val_loss: 0.2424 - val_binary_accuracy: 0.9215
[CV]  architecture=[298, 320, 320, 16], batch_size=32, score=0.7523339699212804, total= 1.5min
[CV] architecture=[298, 320, 320, 16], batch_size=32 .................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.8min remaining:    0.0s


Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 23s - loss: 0.2561 - binary_accuracy: 0.9189 - val_loss: 0.2473 - val_binary_accuracy: 0.9222
Epoch 2/25
 - 22s - loss: 0.2519 - binary_accuracy: 0.9192 - val_loss: 0.2539 - val_binary_accuracy: 0.9217
Epoch 3/25
 - 22s - loss: 0.2511 - binary_accuracy: 0.9192 - val_loss: 0.2505 - val_binary_accuracy: 0.9223
[CV]  architecture=[298, 320, 320, 16], batch_size=32, score=0.7400759280678266, total= 1.1min
[CV] architecture=[298, 320, 320, 16], batch_size=32 .................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.0min remaining:    0.0s


Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 24s - loss: 0.2548 - binary_accuracy: 0.9187 - val_loss: 0.2427 - val_binary_accuracy: 0.9220
Epoch 2/25
 - 22s - loss: 0.2519 - binary_accuracy: 0.9189 - val_loss: 0.2420 - val_binary_accuracy: 0.9214
Epoch 3/25
 - 22s - loss: 0.2517 - binary_accuracy: 0.9189 - val_loss: 0.2422 - val_binary_accuracy: 0.9215
Epoch 4/25
 - 22s - loss: 0.2517 - binary_accuracy: 0.9187 - val_loss: 0.2412 - val_binary_accuracy: 0.9221
Epoch 5/25
 - 23s - loss: 0.2507 - binary_accuracy: 0.9189 - val_loss: 0.2429 - val_binary_accuracy: 0.9222
Epoch 6/25
 - 22s - loss: 0.2498 - binary_accuracy: 0.9190 - val_loss: 0.2411 - val_binary_accuracy: 0.9222
Epoch 7/25
 - 21s - loss: 0.2491 - binary_accuracy: 0.9190 - val_loss: 0.2416 - val_binary_accuracy: 0.9222
Epoch 8/25
 - 22s - loss: 0.2487 - binary_accuracy: 0.9188 - val_loss: 0.2430 - val_binary_accuracy: 0.9216
[CV]  architecture=[298, 320, 320, 16], batch_size=32, score=0.7490305404902448, tota

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  7.1min remaining:    0.0s


Train on 199267 samples, validate on 22141 samples
Epoch 1/25
 - 22s - loss: 0.2601 - binary_accuracy: 0.9185 - val_loss: 0.2466 - val_binary_accuracy: 0.9207
Epoch 2/25
 - 22s - loss: 0.2527 - binary_accuracy: 0.9191 - val_loss: 0.2460 - val_binary_accuracy: 0.9212
Epoch 3/25
 - 21s - loss: 0.2511 - binary_accuracy: 0.9192 - val_loss: 0.2496 - val_binary_accuracy: 0.9192
Epoch 4/25
 - 22s - loss: 0.2503 - binary_accuracy: 0.9192 - val_loss: 0.2469 - val_binary_accuracy: 0.9213
[CV]  architecture=[298, 320, 320, 16], batch_size=32, score=0.7529965604686525, total= 1.5min
[CV] architecture=[298, 320, 320, 16], batch_size=48 .................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.7min remaining:    0.0s


Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 15s - loss: 0.2553 - binary_accuracy: 0.9186 - val_loss: 0.2417 - val_binary_accuracy: 0.9220
Epoch 2/25
 - 15s - loss: 0.2515 - binary_accuracy: 0.9187 - val_loss: 0.2458 - val_binary_accuracy: 0.9221
Epoch 3/25
 - 15s - loss: 0.2512 - binary_accuracy: 0.9189 - val_loss: 0.2412 - val_binary_accuracy: 0.9221
Epoch 4/25
 - 15s - loss: 0.2506 - binary_accuracy: 0.9190 - val_loss: 0.2402 - val_binary_accuracy: 0.9223
Epoch 5/25
 - 15s - loss: 0.2498 - binary_accuracy: 0.9188 - val_loss: 0.2404 - val_binary_accuracy: 0.9222
Epoch 6/25
 - 15s - loss: 0.2494 - binary_accuracy: 0.9189 - val_loss: 0.2410 - val_binary_accuracy: 0.9221
[CV]  architecture=[298, 320, 320, 16], batch_size=48, score=0.7515161589219201, total= 1.5min
[CV] architecture=[298, 320, 320, 16], batch_size=48 .................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 10.3min remaining:    0.0s


Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 15s - loss: 0.2537 - binary_accuracy: 0.9192 - val_loss: 0.2421 - val_binary_accuracy: 0.9221
Epoch 2/25
 - 15s - loss: 0.2506 - binary_accuracy: 0.9193 - val_loss: 0.2431 - val_binary_accuracy: 0.9214
Epoch 3/25
 - 15s - loss: 0.2500 - binary_accuracy: 0.9193 - val_loss: 0.2416 - val_binary_accuracy: 0.9220
Epoch 4/25
 - 15s - loss: 0.2495 - binary_accuracy: 0.9192 - val_loss: 0.2443 - val_binary_accuracy: 0.9221
Epoch 5/25
 - 14s - loss: 0.2489 - binary_accuracy: 0.9193 - val_loss: 0.2419 - val_binary_accuracy: 0.9221
[CV]  architecture=[298, 320, 320, 16], batch_size=48, score=0.7520527649564853, total= 1.3min
[CV] architecture=[298, 320, 320, 16], batch_size=48 .................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 11.6min remaining:    0.0s


Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 15s - loss: 0.2596 - binary_accuracy: 0.9181 - val_loss: 0.2800 - val_binary_accuracy: 0.8883
Epoch 2/25
 - 15s - loss: 0.2518 - binary_accuracy: 0.9190 - val_loss: 0.2442 - val_binary_accuracy: 0.9222
Epoch 3/25
 - 15s - loss: 0.2496 - binary_accuracy: 0.9192 - val_loss: 0.2496 - val_binary_accuracy: 0.9221
Epoch 4/25
 - 15s - loss: 0.2489 - binary_accuracy: 0.9191 - val_loss: 0.2406 - val_binary_accuracy: 0.9222
Epoch 5/25
 - 15s - loss: 0.2484 - binary_accuracy: 0.9191 - val_loss: 0.2405 - val_binary_accuracy: 0.9218
Epoch 6/25
 - 15s - loss: 0.2479 - binary_accuracy: 0.9192 - val_loss: 0.2414 - val_binary_accuracy: 0.9222
Epoch 7/25
 - 15s - loss: 0.2474 - binary_accuracy: 0.9194 - val_loss: 0.2414 - val_binary_accuracy: 0.9222
[CV]  architecture=[298, 320, 320, 16], batch_size=48, score=0.7523006980607122, total= 1.8min
[CV] architecture=[298, 320, 320, 16], batch_size=48 .................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 13.5min remaining:    0.0s


Train on 199266 samples, validate on 22141 samples
Epoch 1/25
 - 16s - loss: 0.2568 - binary_accuracy: 0.9185 - val_loss: 0.2410 - val_binary_accuracy: 0.9224
Epoch 2/25
 - 14s - loss: 0.2520 - binary_accuracy: 0.9189 - val_loss: 0.2462 - val_binary_accuracy: 0.9223
Epoch 3/25
 - 15s - loss: 0.2509 - binary_accuracy: 0.9189 - val_loss: 0.2423 - val_binary_accuracy: 0.9210
[CV]  architecture=[298, 320, 320, 16], batch_size=48, score=0.7466998591126357, total=  47.0s
[CV] architecture=[298, 320, 320, 16], batch_size=48 .................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 14.3min remaining:    0.0s


Train on 199267 samples, validate on 22141 samples
Epoch 1/25
 - 16s - loss: 0.2540 - binary_accuracy: 0.9190 - val_loss: 0.2477 - val_binary_accuracy: 0.9213
Epoch 2/25
 - 15s - loss: 0.2510 - binary_accuracy: 0.9192 - val_loss: 0.2549 - val_binary_accuracy: 0.9212
Epoch 3/25
 - 15s - loss: 0.2505 - binary_accuracy: 0.9191 - val_loss: 0.2462 - val_binary_accuracy: 0.9200
Epoch 4/25
 - 14s - loss: 0.2496 - binary_accuracy: 0.9192 - val_loss: 0.2470 - val_binary_accuracy: 0.9213
Epoch 5/25
 - 15s - loss: 0.2487 - binary_accuracy: 0.9191 - val_loss: 0.2464 - val_binary_accuracy: 0.9206
[CV]  architecture=[298, 320, 320, 16], batch_size=48, score=0.7538730642195899, total= 1.3min


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 15.7min finished


Train on 249083 samples, validate on 27676 samples
Epoch 1/25
 - 19s - loss: 0.2568 - binary_accuracy: 0.9186 - val_loss: 0.2485 - val_binary_accuracy: 0.9225
Epoch 2/25
 - 19s - loss: 0.2516 - binary_accuracy: 0.9190 - val_loss: 0.2423 - val_binary_accuracy: 0.9225
Epoch 3/25
 - 19s - loss: 0.2508 - binary_accuracy: 0.9190 - val_loss: 0.2404 - val_binary_accuracy: 0.9225
Epoch 4/25
 - 19s - loss: 0.2501 - binary_accuracy: 0.9190 - val_loss: 0.2416 - val_binary_accuracy: 0.9225
Epoch 5/25
 - 20s - loss: 0.2491 - binary_accuracy: 0.9191 - val_loss: 0.2446 - val_binary_accuracy: 0.9224


## Too close to call, so more testing ....
The train AUCs of the best RELU and SELU models were basically equivalent (.7518 and .7513, respectively). However, the SELU activation model's test AUC was modestly more favorable than the RELU model's (.755 and .752, respectively). This result is consistent with the idea that the SELU activation improves more rapidly as the amount of training data increases; however, the scores are not so disparate as to find one activation clearly better than the other.

Consequently, we use 10-fold cross-validation over the entire training set (test verification has already been done). The winner will be the activation that yields the best average training AUC over the 10-fold cross-validation.

In [7]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Build the preprocessing pipeline for the logit column sets and run it over
# the training phase; X and Y are the inputs used by the cross-validation below.
pipeline = get_standard_pipeline(
    cat_attribs=CATEGORY_COLS_logit,
    num_attribs=NUMERIC_COLS_logit,
)
X, Y, pipe = pre_process(phase="train", preproc_pipeline=pipeline)

# Layer sizes shared by both candidate networks.
final_architecture = [298, 320, 320, 16]

# Stop a fit after 2 epochs without improvement and roll the weights back to
# the best epoch seen.
patient_early_stopping = EarlyStopping(patience=2, restore_best_weights=True)

def get_relu_with_final_arch():
    """Build a new model via ``get_keras_model`` for ``final_architecture``.

    ``final_architecture`` is looked up when the function is called, so the
    returned model always reflects the current value of that global.
    """
    return get_keras_model(architecture=final_architecture)


# Wrapper that exposes the builder above through the scikit-learn estimator
# interface; each fit holds out 10% of its data for Keras-side validation.
relu_clf = KerasClassifier(
    build_fn=get_relu_with_final_arch,
    epochs=25,
    validation_split=0.1,
    verbose=2,
)

def get_selu_with_final_arch():
    """Build a new model via ``get_selu_activation_model`` for ``final_architecture``.

    ``final_architecture`` is looked up when the function is called, so the
    returned model always reflects the current value of that global.
    """
    return get_selu_activation_model(architecture=final_architecture)


# Wrapper that exposes the builder above through the scikit-learn estimator
# interface; each fit holds out 10% of its data for Keras-side validation.
selu_clf = KerasClassifier(
    build_fn=get_selu_with_final_arch,
    epochs=25,
    validation_split=0.1,
    verbose=2,
)

def get_cross_val_scores(clf, cv=10, batch_size=48, scoring='roc_auc'):
    """Cross-validate ``clf`` on the notebook-level ``X`` / ``Y``.

    Parameters
    ----------
    clf : estimator
        Scikit-learn compatible classifier to evaluate.
    cv : int, default 10
        Number of folds handed to ``cross_val_score``.
    batch_size : int, default 48
        Batch size forwarded to the estimator's ``fit`` for every fold.
    scoring : str, default 'roc_auc'
        Metric name handed to ``cross_val_score``.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``.

    The defaults reproduce the previously hard-coded settings, so existing
    single-argument calls behave as before.
    """
    # Every fold's fit receives the same early-stopping callback and batch size.
    fit_params = {
        'callbacks': [patient_early_stopping],
        'batch_size': batch_size,
    }
    return cross_val_score(
        clf,
        X, Y,
        scoring=scoring,
        cv=cv,
        fit_params=fit_params,
    )

# Evaluate the two candidates one after the other (ReLU wrapper first, then
# SELU wrapper); the generator keeps the loop variable out of the notebook
# namespace.
relu_scores, selu_scores = (
    get_cross_val_scores(candidate) for candidate in (relu_clf, selu_clf)
)

# Report the mean and the individual fold scores for each activation.
print(f"relu mean: {np.mean(relu_scores)}, array: {relu_scores}")
print(f"selu mean: {np.mean(selu_scores)}, array: {selu_scores}")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 249083 samples, validate on 27676 samples
Epoch 1/25
 - 21s - loss: 0.2535 - binary_accuracy: 0.9190 - val_loss: 0.2526 - val_binary_accuracy: 0.9208
Epoch 2/25
 - 17s - loss: 0.2502 - binary_accuracy: 0.9191 - val_loss: 0.2467 - val_binary_accuracy: 0.9207
Epoch 3/25
 - 17s - loss: 0.2493 - binary_accuracy: 0.9191 - val_loss: 0.2445 - val_binary_accuracy: 0.9208
Epoch 4/25
 - 17s - loss: 0.2490 - binary_accuracy: 0.9192 - val_loss: 0.2494 - val_binary_accuracy: 0.9208
Epoch 5/25
 - 18s - loss: 0.2484 - binary_accuracy: 0.9193 - val_loss: 0.2450 - val_binary_accuracy: 0.9211
Train on 249084 samples, validate on 27676 samples
Epoch 1/25
 - 17s - loss: 0.2542 - binary_accuracy: 0.9190 - val_loss: 0.2549 - val_binary_accuracy: 0.9207
Epoch 2/25
 - 17s - loss: 0.2506 - binary_accuracy: 0.9190 - val_loss: 0.2498 - val_binary_accuracy: 0.9209
Epoch 3/25
 - 17s - lo

 - 19s - loss: 0.2504 - binary_accuracy: 0.9191 - val_loss: 0.2470 - val_binary_accuracy: 0.9205
Epoch 4/25
 - 19s - loss: 0.2495 - binary_accuracy: 0.9191 - val_loss: 0.2476 - val_binary_accuracy: 0.9208
Epoch 5/25
 - 19s - loss: 0.2491 - binary_accuracy: 0.9191 - val_loss: 0.2489 - val_binary_accuracy: 0.9208
Train on 249084 samples, validate on 27676 samples
Epoch 1/25
 - 20s - loss: 0.2550 - binary_accuracy: 0.9188 - val_loss: 0.2485 - val_binary_accuracy: 0.9205
Epoch 2/25
 - 19s - loss: 0.2499 - binary_accuracy: 0.9193 - val_loss: 0.2535 - val_binary_accuracy: 0.9206
Epoch 3/25
 - 19s - loss: 0.2484 - binary_accuracy: 0.9193 - val_loss: 0.2463 - val_binary_accuracy: 0.9208
Epoch 4/25
 - 19s - loss: 0.2476 - binary_accuracy: 0.9194 - val_loss: 0.2449 - val_binary_accuracy: 0.9206
Epoch 5/25
 - 19s - loss: 0.2471 - binary_accuracy: 0.9193 - val_loss: 0.2459 - val_binary_accuracy: 0.9209
Epoch 6/25
 - 19s - loss: 0.2465 - binary_accuracy: 0.9194 - val_loss: 0.2458 - val_binary_accur

## Preparing the Kaggle submission for the winning neural network
It was a close race, but the victory goes to SELU activation. Let's prepare the Kaggle submission. We train two models: 
* The first is monitored against a holdout validation set of 10% to see how many epochs to run before the overfitting threshold is crossed. 
* The second is trained with the entire training set for the number of epochs identified in the first model's training.

The predictions of both models will be submitted to Kaggle.

In [7]:
# Rebuild the preprocessing pipeline and the training inputs.
pipeline = get_standard_pipeline(
    cat_attribs=CATEGORY_COLS_logit,
    num_attribs=NUMERIC_COLS_logit,
)
X, Y, pipe = pre_process(phase="train", preproc_pipeline=pipeline)

# First model: monitored on a 10% validation split with early stopping, so the
# training log shows after how many epochs validation loss stops improving.
patient_early_stopping = EarlyStopping(patience=2, restore_best_weights=True)
best_clf_val = get_selu_with_final_arch()
best_clf_val.fit(
    X, Y,
    validation_split=0.1,
    callbacks=[patient_early_stopping],
    epochs=25,
    batch_size=48,
    verbose=2,
)

# Run the test phase through the same pipeline object and score it.
# NOTE(review): whether pre_process re-fits the pipeline on the test phase is
# not visible here -- confirm it only transforms.
test_data, _, _ = pre_process(phase="test", preproc_pipeline=pipeline)
preds = best_clf_val.predict_proba(test_data)



loaded 307511 records
features from previous application
features from bureau
features from credit card balance
features from installments
features from application
start pipeline
Instructions for updating:
Use tf.cast instead.
Train on 276759 samples, validate on 30752 samples
Epoch 1/25
 - 21s - loss: 0.2541 - binary_accuracy: 0.9188 - val_loss: 0.2468 - val_binary_accuracy: 0.9209
Epoch 2/25
 - 19s - loss: 0.2495 - binary_accuracy: 0.9192 - val_loss: 0.2461 - val_binary_accuracy: 0.9209
Epoch 3/25
 - 19s - loss: 0.2486 - binary_accuracy: 0.9191 - val_loss: 0.2444 - val_binary_accuracy: 0.9209
Epoch 4/25
 - 19s - loss: 0.2479 - binary_accuracy: 0.9190 - val_loss: 0.2448 - val_binary_accuracy: 0.9212
Epoch 5/25
 - 19s - loss: 0.2474 - binary_accuracy: 0.9192 - val_loss: 0.2459 - val_binary_accuracy: 0.9210
loaded 48744 records
features from previous application
features from bureau
features from credit card balance
features from installments
features from application
start pipeline
lo

KeyError: 'application_test'

In [9]:
# Pair each test-set SK_ID_CURR with its predicted TARGET and write the
# submission file for the holdout-validated model.
submit_df = load_data("application_test")[['SK_ID_CURR']].assign(TARGET=preds)
submit_df.to_csv("submission_val.csv", index=False)


In [12]:
# Second model: retrain from scratch on all of X / Y (no validation split)
# for a fixed 3 epochs.
best_clf = get_selu_with_final_arch()
best_clf.fit(X, Y, epochs=3, batch_size=48, verbose=2)

# Score the test set and write the two-column (SK_ID_CURR, TARGET) submission.
preds = best_clf.predict_proba(test_data)
submit_df = load_data("application_test")[['SK_ID_CURR']].assign(TARGET=preds)
submit_df.to_csv("submission.csv", index=False)

Epoch 1/3
 - 21s - loss: 0.2527 - binary_accuracy: 0.9191
Epoch 2/3
 - 20s - loss: 0.2504 - binary_accuracy: 0.9192
Epoch 3/3
 - 21s - loss: 0.2495 - binary_accuracy: 0.9191


### Kaggle score set as holdout validation
Using a portion of the training data for holdout validation lets you see when the model starts to overfit. However, the neural network then tends to score lower, because its accuracy improves with the amount of training data and part of that data is withheld from training. The scores on the initial NN Kaggle submissions demonstrate this trade-off:

![Initial NN submissions](NN_init_submissions.png) 

Is there a way to avoid this trade-off so we can train the neural network on all the data, while still finding the optimal number of training epochs? Yes: use the Kaggle scoreboard test set as a holdout validation set! In the cells below, we increase the number of epochs by one with each submission until we have passed the optimum.

In [16]:
# Try one more epoch => 4
# Retrain from scratch on all of X / Y (no validation split) for 4 epochs.
best_clf = get_selu_with_final_arch()
best_clf.fit(X, Y, epochs=4, batch_size=48, verbose=2)

# Score the test set and write the two-column (SK_ID_CURR, TARGET) submission.
preds = best_clf.predict_proba(test_data)
submit_df = load_data("application_test")[['SK_ID_CURR']].assign(TARGET=preds)
submit_df.to_csv("submission4.csv", index=False)

Epoch 1/4
 - 21s - loss: 0.2536 - binary_accuracy: 0.9192
Epoch 2/4
 - 21s - loss: 0.2504 - binary_accuracy: 0.9193
Epoch 3/4
 - 21s - loss: 0.2498 - binary_accuracy: 0.9193
Epoch 4/4
 - 21s - loss: 0.2488 - binary_accuracy: 0.9194


In [19]:
# Try one more epoch => 5
# Retrain from scratch on all of X / Y (no validation split) for 5 epochs.
best_clf = get_selu_with_final_arch()
best_clf.fit(X, Y, epochs=5, batch_size=48, verbose=2)

# Score the test set and write the two-column (SK_ID_CURR, TARGET) submission.
preds = best_clf.predict_proba(test_data)
submit_df = load_data("application_test")[['SK_ID_CURR']].assign(TARGET=preds)
submit_df.to_csv("submission5.csv", index=False)

Epoch 1/5
 - 22s - loss: 0.2550 - binary_accuracy: 0.9188
Epoch 2/5
 - 21s - loss: 0.2503 - binary_accuracy: 0.9193
Epoch 3/5
 - 21s - loss: 0.2493 - binary_accuracy: 0.9193
Epoch 4/5
 - 21s - loss: 0.2485 - binary_accuracy: 0.9193
Epoch 5/5
 - 21s - loss: 0.2479 - binary_accuracy: 0.9192


In [22]:
# Try one more epoch => 6
# Retrain from scratch on all of X / Y (no validation split) for 6 epochs.
best_clf = get_selu_with_final_arch()
best_clf.fit(X, Y, epochs=6, batch_size=48, verbose=2)

# Score the test set and write the two-column (SK_ID_CURR, TARGET) submission.
preds = best_clf.predict_proba(test_data)
submit_df = load_data("application_test")[['SK_ID_CURR']].assign(TARGET=preds)
submit_df.to_csv("submission6.csv", index=False)

Epoch 1/6
 - 22s - loss: 0.2564 - binary_accuracy: 0.9186
Epoch 2/6
 - 21s - loss: 0.2501 - binary_accuracy: 0.9193
Epoch 3/6
 - 21s - loss: 0.2487 - binary_accuracy: 0.9193
Epoch 4/6
 - 21s - loss: 0.2480 - binary_accuracy: 0.9193
Epoch 5/6
 - 21s - loss: 0.2475 - binary_accuracy: 0.9193
Epoch 6/6
 - 21s - loss: 0.2472 - binary_accuracy: 0.9193


In [24]:
# Try one more epoch => 7
# Retrain from scratch on all of X / Y (no validation split) for 7 epochs.
best_clf = get_selu_with_final_arch()
best_clf.fit(X, Y, epochs=7, batch_size=48, verbose=2)

# Score the test set and write the two-column (SK_ID_CURR, TARGET) submission.
preds = best_clf.predict_proba(test_data)
submit_df = load_data("application_test")[['SK_ID_CURR']].assign(TARGET=preds)
submit_df.to_csv("submission7.csv", index=False)

Epoch 1/7
 - 23s - loss: 0.2538 - binary_accuracy: 0.9191
Epoch 2/7
 - 22s - loss: 0.2504 - binary_accuracy: 0.9192
Epoch 3/7
 - 22s - loss: 0.2495 - binary_accuracy: 0.9192
Epoch 4/7
 - 21s - loss: 0.2487 - binary_accuracy: 0.9192
Epoch 5/7
 - 21s - loss: 0.2479 - binary_accuracy: 0.9191
Epoch 6/7
 - 21s - loss: 0.2470 - binary_accuracy: 0.9193
Epoch 7/7
 - 21s - loss: 0.2469 - binary_accuracy: 0.9193


Seven epochs clearly reach the overfitting zone, as seen below:
    
![NN increased epochs submissions](NN_later_submissions.png)

Using the public scoreboard as the validation set, we see that 6 training epochs give the neural network the best generalization capability. The final AUC score is a healthy **0.75436**.

# Kaggle submission via the command line API

In [25]:
# Upload the 7-epoch predictions through the Kaggle CLI. The message labels the
# run with the layer sizes after the first entry of [298, 320, 320, 16].
! kaggle competitions submit -c home-credit-default-risk -f submission7.csv -m "NN 320-320-16 7 epochs"

Successfully submitted to Home Credit Default Risk



  0%|          | 0.00/921k [00:00<?, ?B/s]
 10%|9         | 88.0k/921k [00:00<00:01, 798kB/s]
100%|##########| 921k/921k [00:00<00:00, 961kB/s] 


## Report submission

Click on this [link](https://www.kaggle.com/c/home-credit-default-risk/submissions?sortBy=date&group=all&page=1)

![image.png](attachment:image.png)

# References

Some of the material in this notebook has been adapted from [here](https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction/notebook).

The research paper that introduced scaled exponential linear unit activation is available [here](https://arxiv.org/abs/1706.02515).
