In [1]:
import os, sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from keras import backend as K
from keras.models import Sequential, Model
# from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
# from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
#from keras.regularizers import l2, activity_l2, l1, activity_l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam

from pre_process import *

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Locate the data folder relative to the directory the notebook is run from.
current_dir = os.getcwd()
DATA_HOME_DIR = '{}/../data/'.format(current_dir)

## Data

In [3]:
# Columns requested from the loader, grouped by the reason each one was picked.
# The final `cols` list keeps the same order as the groups below.

# Row identifier
_id_cols = ['SK_ID_CURR']

# Some columns that sound useful!
_handpicked_cols = [
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'DAYS_REGISTRATION',
    'OWN_CAR_AGE',
]

# Positively correlated to the target (taken from the top 10; 9 are listed here)
_pos_corr_cols = [
    'DAYS_BIRTH',
    'REGION_RATING_CLIENT_W_CITY',
    'REGION_RATING_CLIENT',
    'DAYS_LAST_PHONE_CHANGE',
    'DAYS_ID_PUBLISH',
    'REG_CITY_NOT_WORK_CITY',
    'FLAG_EMP_PHONE',
    'REG_CITY_NOT_LIVE_CITY',
    'FLAG_DOCUMENT_3',
]

# Negatively correlated to the target (top 10)
_neg_corr_cols = [
    'ELEVATORS_AVG',
    'REGION_POPULATION_RELATIVE',
    'AMT_GOODS_PRICE',
    'FLOORSMAX_MODE',
    'FLOORSMAX_MEDI',
    'FLOORSMAX_AVG',
    'DAYS_EMPLOYED',
    'EXT_SOURCE_1',
    'EXT_SOURCE_3',
    'EXT_SOURCE_2',
]

cols = _id_cols + _handpicked_cols + _pos_corr_cols + _neg_corr_cols

In [4]:
# Load the train/test frames and the training target, restricted to the selected columns.
# load_train_test_data comes from the star import of pre_process.
df_train, df_test, y = load_train_test_data(
    DATA_HOME_DIR,
    in_cols=cols,
)

In [5]:
# Quick sanity check: (rows, columns) of the loaded training frame.
df_train.shape

(307511, 31)

First, collect the numerical feature columns so they can be standardized.

In [6]:
# Names of every float64 / int64 column in the training frame, in frame order.
numerical_feats = (
    df_train
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .tolist()
)


In [7]:
# Standardize the numerical columns of the training frame.
# normalize_numericals comes from pre_process; its exact scaling method is not visible here.
df_train_norm = normalize_numericals(df_train, numerical_feats)

In [8]:
# Standardize the same numerical columns on the test frame.
# NOTE(review): this call only receives df_test, so if normalize_numericals derives its
# statistics from the frame it is given, test is scaled with different parameters than
# train. Confirm in pre_process, and reuse the training statistics if that is the case.
df_test_norm = normalize_numericals(df_test, numerical_feats)

In [9]:
# Encode both normalized frames with the pre_process helper.
# Presumably this dummy/one-hot encodes the categorical columns so train and test share
# the same columns; verify in pre_process.
df_train_enc, df_test_enc = load_data_dummies(df_train_norm, df_test_norm)

In [10]:
# (rows, columns) after encoding; the column count is the model's input width.
df_train_enc.shape

(307511, 63)

In [11]:
# Show every column when displaying wide frames.
pd.set_option('display.max_columns', None)

# Peek at a few random rows of the loaded training frame.
# The seed makes the displayed rows the same on every Restart & Run All.
df_train.sample(5, random_state=42)

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE
6334,Revolving loans,F,Y,Y,Unaccompanied,Working,Higher education,Married,House / apartment
157555,Revolving loans,F,Y,Y,"Spouse, partner",Working,Secondary / secondary special,Married,House / apartment
110492,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment
47523,Cash loans,F,N,N,Unaccompanied,Commercial associate,Higher education,Married,House / apartment
289566,Cash loans,M,Y,N,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment


## Split data
TODO: replace this single hold-out split with k-fold cross-validation.

In [12]:
# Hold out 20% of the encoded training data for validation (fixed seed for repeatability).
# Note: x_test / y_test are this hold-out split, not the separate df_test frame.
x_train, x_test, y_train, y_test = train_test_split(
    df_train_enc,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# (rows, columns) of the training split.
x_train.shape

(246008, 63)

In [14]:
# Length of the training target; should match the row count of x_train.
y_train.shape

(246008,)

## Model Setup
Let's create a simple fully connected (dense) model with two hidden layers and try to train that. This will be our baseline performance for any other architecture we try.

In [15]:
# Network sizing and optimizer settings for the baseline model.
# The input width is read from the training matrix instead of being hard-coded, so the
# model keeps matching the data if the feature list or the encoding changes
# (it evaluates to 63 for the split shown above).
in_layers = int(x_train.shape[1])  # number of input features (name kept for later cells)
fd_layers = in_layers * 2          # units in the first hidden layer

# NOTE(review): 0.1 is 100x the Keras default for Adam (0.001); the ROC-AUC in the
# training log below jumps around from epoch to epoch, so a smaller value is worth trying.
learning_rate = 0.1

In [None]:
# Baseline fully connected binary classifier:
#   input -> Dense(fd_layers, relu) -> Dense(fd_layers // 2, relu) -> BatchNorm -> Dense(1, sigmoid)
model = Sequential([
    Dense(fd_layers, activation='relu', input_dim=in_layers),
    Dense(fd_layers // 2, activation='relu'),
    BatchNormalization(),
    Dense(1, activation='sigmoid'),  # single probability output
])
model.compile(
    optimizer=Adam(lr=learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Mini-batch size used by every model.fit call below.
batch_size = 64

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [20]:
# Callback that receives the training split and the hold-out split; the training log
# below shows it printing `roc-auc` and `roc-auc_val` each epoch.
from roc_callback import *
roc_cb = roc_callback(
    training_data=(x_train, y_train),
    validation_data=(x_test, y_test),
)

In [21]:
# First training round: 10 epochs, with ROC-AUC reported through roc_cb.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=10,
    callbacks=[roc_cb],
)

Epoch 1/10
roc-auc: 0.6205 - roc-auc_val: 0.6193                                                                                                    
Epoch 2/10
roc-auc: 0.6272 - roc-auc_val: 0.6295                                                                                                    
Epoch 3/10
roc-auc: 0.606 - roc-auc_val: 0.6053                                                                                                    
Epoch 4/10
roc-auc: 0.632 - roc-auc_val: 0.6314                                                                                                    
Epoch 5/10
roc-auc: 0.6281 - roc-auc_val: 0.6268                                                                                                    
Epoch 6/10
roc-auc: 0.6359 - roc-auc_val: 0.636                                                                                                    
Epoch 7/10
roc-auc: 0.6332 - roc-auc_val: 0.6327                                                             

<keras.callbacks.History at 0x2c8d702d940>

In [22]:
# Lower the learning rate for the next round of training.
# Plain assignment (`model.optimizer.lr = 0.01`) only rebinds the Python attribute to a
# float; the training function built by the earlier fit() keeps using the original
# backend variable, so the new rate would never be applied. Writing the value into the
# existing variable makes the change take effect.
K.set_value(model.optimizer.lr, 0.01)

In [23]:
# Second training round: 5 more epochs on the same model, with ROC-AUC reported through roc_cb.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=5,
    callbacks=[roc_cb],
)

Epoch 1/5
roc-auc: 0.6381 - roc-auc_val: 0.6352                                                                                                    
Epoch 2/5
roc-auc: 0.6408 - roc-auc_val: 0.6348                                                                                                    
Epoch 3/5
roc-auc: 0.6388 - roc-auc_val: 0.6342                                                                                                    
Epoch 4/5
roc-auc: 0.6044 - roc-auc_val: 0.5945                                                                                                    
Epoch 5/5
roc-auc: 0.6385 - roc-auc_val: 0.6345                                                                                                    


<keras.callbacks.History at 0x2c8cee52748>