In [1]:
import os, sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from keras import backend as K
from keras.models import Sequential, Model
# from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
# from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
#from keras.regularizers import l2, activity_l2, l1, activity_l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Directory layout: the notebook's working directory is the lesson root,
# and all data lives under <cwd>/data/.
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = '{}/data/'.format(current_dir)

# The train and test paths currently point at the data root itself; replace
# the empty suffix with 'sample/' to work from a sample directory instead.
train_path = DATA_HOME_DIR + ''
test_path = DATA_HOME_DIR + ''
results_path = '{}results/'.format(DATA_HOME_DIR)

## Data

In [3]:
# Read the training and test application tables from their respective data directories.
df_train = pd.read_csv("{}application_train.csv".format(train_path))
df_test = pd.read_csv("{}application_test.csv".format(test_path))

## Feature Engineering
First, let's pick the subset of columns to encode.

In [8]:
# Utility to add missing dummies in test set
def fix_missing_cols(in_train, in_test):
    """Return a copy of ``in_test`` whose columns match ``in_train`` exactly.

    Columns present in ``in_train`` but absent from ``in_test`` (e.g. dummy
    columns for categories that never occur in the test set) are added and
    filled with 0. Columns that exist only in ``in_test`` are dropped, and the
    result follows the column order of ``in_train``.

    Unlike assigning the missing columns one by one, this does not modify the
    caller's ``in_test`` frame.

    Parameters
    ----------
    in_train : pandas.DataFrame
        Frame whose columns define the target layout.
    in_test : pandas.DataFrame
        Frame to align to that layout.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows of ``in_test`` and the columns of ``in_train``.
    """
    # reindex adds, drops and reorders columns in one step; fill_value only
    # applies to newly created columns, existing values are left untouched.
    return in_test.reindex(columns=in_train.columns, fill_value=0)

In [9]:
cols = ['SK_ID_CURR',
 'NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE'
]


Now one-hot encode the selected columns. Ideally the encoding would cover every category present in the underlying data set (not just those that appear in the sample).

In [10]:
# One-hot encode the selected columns of the training frame.
# NOTE(review): get_dummies only expands object/category columns by default,
# so any numeric column in `cols` presumably passes through unchanged -- confirm.
df_train_enc = pd.get_dummies(df_train.loc[:, cols])

In [11]:
# Sanity check: (rows, encoded columns) of the one-hot encoded training frame.
df_train_enc.shape

(307511, 42)

In [16]:
# Label frame for the split. It is kept as a single-column DataFrame, which
# matches the (n, 1) shape reported for y_train.
# NOTE(review): df_train_y was not defined anywhere in the notebook as saved,
# so a fresh kernel failed here. 'TARGET' is assumed to be the label column of
# application_train.csv -- confirm against the data.
df_train_y = df_train[['TARGET']]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_train_enc, df_train_y, test_size=0.2, random_state=42
)

In [17]:
# Sanity check: shape of the training features after the 80/20 split.
x_train.shape

(246008, 42)

In [18]:
# Sanity check: the label frame should have the same number of rows as x_train.
y_train.shape

(246008, 1)

## Model Setup

In [34]:
# Network sizing and optimiser settings.
# The input width is taken from the encoded training frame (42 columns for the
# current column selection) so it stays correct if the feature set changes.
in_layers = x_train.shape[1]
# Width of the first hidden layer: twice the input width.
fd_layers = in_layers * 2
# NOTE(review): 0.1 is 100x Adam's documented default of 0.001, and the
# recorded runs sit at ROC-AUC 0.5 -- consider a much smaller value.
learning_rate = 0.1

In [37]:
# Feed-forward binary classifier: two ReLU hidden layers (fd_layers units, then
# half of that), batch normalisation, and a single sigmoid output unit.
model = Sequential([
    Dense(fd_layers, input_dim=in_layers, activation='relu'),
    Dense(fd_layers // 2, activation='relu'),
    BatchNormalization(),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss optimised with Adam; accuracy is tracked as a metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=learning_rate),
    metrics=['accuracy'],
)

In [21]:
# Mini-batch size passed to model.fit in the training cells.
batch_size=64

In [38]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 84)                3612      
_________________________________________________________________
dense_17 (Dense)             (None, 42)                3570      
_________________________________________________________________
batch_normalization_2 (Batch (None, 42)                168       
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 43        
Total params: 7,393
Trainable params: 7,309
Non-trainable params: 84
_________________________________________________________________


In [23]:
# Import only the name this notebook uses instead of a wildcard import, so it
# is clear where roc_callback comes from and nothing else leaks into the
# notebook namespace.
from roc_callback import roc_callback

# Callback given both the training split and the held-out split; the training
# logs show it reporting `roc-auc` and `roc-auc_val` once per epoch.
roc_cb = roc_callback(
    training_data=(x_train, y_train),
    validation_data=(x_test, y_test),
)

In [39]:
# First training round: up to 10 epochs on the training split, with the ROC
# callback reporting train/validation ROC-AUC for each epoch.
model.fit(x_train, y_train, epochs=10, batch_size=batch_size, callbacks=[roc_cb])

Epoch 1/10
roc-auc: 0.4979 - roc-auc_val: 0.4974                                                                                                    
Epoch 2/10
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 3/10
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 4/10
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 5/10
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 6/10
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 7/10
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                              

<keras.callbacks.History at 0x1dc13248080>

In [40]:
# Lower the learning rate before the next training round.
# The optimiser's `lr` is a backend variable that the already-built training
# function reads; assigning a Python float to the attribute only rebinds the
# attribute and leaves that variable untouched. K.set_value updates the
# variable in place so the new rate is actually used by the next fit call.
K.set_value(model.optimizer.lr, 0.01)

In [41]:
# Second training round: continue training the same model for 5 more epochs
# after the learning-rate change, with the same ROC-AUC reporting callback.
model.fit(x_train, y_train, epochs=5, batch_size=batch_size, callbacks=[roc_cb])

Epoch 1/5
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 2/5
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 3/5
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 4/5
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    
Epoch 5/5
roc-auc: 0.5 - roc-auc_val: 0.5                                                                                                    


<keras.callbacks.History at 0x1dc13a0bc50>