In [1]:
import os, sys
import numpy as np
from matplotlib import pyplot as plt

import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder

from keras.models import Sequential, Model
# from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
# from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
#from keras.regularizers import l2, activity_l2, l1, activity_l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam

from sklearn.model_selection import train_test_split

from pre_process import *

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Init some useful dirs
current_dir = os.getcwd()
DATA_HOME_DIR = current_dir+'/../data/'

## Data

In [3]:
pd.options.display.max_columns = None

In [4]:
cols = ['SK_ID_CURR',
        # Some columns that sound useful!
        'NAME_CONTRACT_TYPE',
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'DAYS_REGISTRATION',
        'OWN_CAR_AGE',
        # Positively correlated to the target ( top 10 )
        'DAYS_BIRTH',
        'REGION_RATING_CLIENT_W_CITY',
        'REGION_RATING_CLIENT',
        'DAYS_LAST_PHONE_CHANGE',
        'DAYS_ID_PUBLISH',
        'REG_CITY_NOT_WORK_CITY',
        'FLAG_EMP_PHONE',
        'REG_CITY_NOT_LIVE_CITY',
        'FLAG_DOCUMENT_3',
        # Negative correlated to the target (top 10)
        'ELEVATORS_AVG',
        'REGION_POPULATION_RELATIVE',
        'AMT_GOODS_PRICE',
        'FLOORSMAX_MODE',
        'FLOORSMAX_MEDI',
        'FLOORSMAX_AVG',
        'DAYS_EMPLOYED',
        'EXT_SOURCE_1',
        'EXT_SOURCE_3',
        'EXT_SOURCE_2'
        ]

In [5]:
df_train, df_test, y = load_train_test_data(DATA_HOME_DIR,in_cols=cols) 

In [6]:
df_train.shape

(307511, 31)

### Additional features

In [7]:
df_train, df_test = append_bureau_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)

In [8]:
df_train, df_test = append_previous_applications(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)

In [9]:
#df_train, df_test = append_pos_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)

In [10]:
#df_train, df_test = append_credit_card_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)

In [11]:
#df_train, df_test = append_installments_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)

In [12]:
df_train.shape

(307511, 240)

### Numericals standardizing

In [13]:
numerical_feats = [
            f for f in df_train.columns if df_train[f].dtype == 'float64' or df_train[f].dtype == 'int64'
        ]


In [14]:
df_train_norm = normalize_numericals(df_train, numerical_feats)

In [15]:
df_test_norm = normalize_numericals(df_test, numerical_feats)

In [16]:
df_train_enc, df_test_enc = load_data_dummies(df_train_norm, df_test_norm)

In [17]:
df_train_enc.shape

(307511, 272)

In [18]:
df_test_enc.shape

(48744, 2750)

# Split data
TODO: cross folds

In [19]:
x_train, x_test, y_train, y_test = train_test_split( df_train_enc, y, test_size=0.2, random_state=42)

In [20]:
x_train.shape

(246008, 272)

In [21]:
y_train.shape

(246008,)

## Model Setup
Lets create a simple convolutional model with 2 layers and try to train that. This will be our baseline performance for any convolutional architecture we try 

In [53]:
in_layers = x_train.shape[1]
fd_layers = in_layers*2
learning_rate = 0.1
dropout=0.2
print("layers: %s" % in_layers)

layers: 272


In [26]:
def add_layer(model, layers, initializer="glorot_normal", dropout=None):
    model.add(Dense(fd_layers, activation='tanh', kernel_initializer='glorot_uniform'))
    model.add(BatchNormalization())
    if dropout:
        model.add(Dropout(dropout))


In [54]:
model = Sequential()
model.add(Dense(fd_layers, input_dim=in_layers, activation='relu',
                            kernel_initializer='glorot_normal'))

#add_layer(model, fd_layers)
#add_layer(model, fd_layers*2)
#add_layer(model, fd_layers*4)
add_layer(model, 128)
add_layer(model, 64)
add_layer(model, 32, dropout=dropout)
add_layer(model, 16, dropout=dropout)


model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy'])

In [28]:
batch_size=64

In [55]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 544)               148512    
_________________________________________________________________
dense_46 (Dense)             (None, 544)               296480    
_________________________________________________________________
batch_normalization_28 (Batc (None, 544)               2176      
_________________________________________________________________
dense_47 (Dense)             (None, 544)               296480    
_________________________________________________________________
batch_normalization_29 (Batc (None, 544)               2176      
_________________________________________________________________
dense_48 (Dense)             (None, 544)               296480    
_________________________________________________________________
batch_normalization_30 (Batc (None, 544)               2176      
__________

In [56]:
from roc_callback import *
roc_cb = roc_callback(training_data=(x_train, y_train),validation_data=(x_test, y_test))

In [57]:
model.fit(x_train, y_train, epochs=10, batch_size=batch_size, callbacks=[roc_cb])

Epoch 1/10
roc-auc: 0.4984 - roc-auc_val: 0.4988                                                                                                    
Epoch 2/10
roc-auc: 0.4981 - roc-auc_val: 0.4988                                                                                                    
Epoch 3/10
roc-auc: 0.4982 - roc-auc_val: 0.4987                                                                                                    
Epoch 4/10

KeyboardInterrupt: 

In [None]:
model.optimizer.lr = 0.01

In [None]:
model.fit(x_train, y_train, epochs=20, batch_size=batch_size, callbacks=[roc_cb])

In [None]:
model.optimizer.lr = 0.001
model.fit(x_train, y_train, epochs=20, batch_size=batch_size, callbacks=[roc_cb])