In [131]:
import os, sys
import numpy as np
from matplotlib import pyplot as plt

import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder

from keras.models import Sequential, Model
# from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
# from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.advanced_activations import LeakyReLU, PReLU
#from keras.regularizers import l2, activity_l2, l1, activity_l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam

from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split

from pre_process import *

%matplotlib inline

In [2]:
# Init some useful dirs
current_dir = os.getcwd()
DATA_HOME_DIR = current_dir+'/../data/'

## Data

In [3]:
pd.options.display.max_columns = None

In [4]:
cols = ['SK_ID_CURR',
        # Some columns that sound useful!
        'NAME_CONTRACT_TYPE',
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'DAYS_REGISTRATION',
        'OWN_CAR_AGE',
        'CNT_FAM_MEMBERS',
        # Positively correlated to the target ( top 10 )
        'DAYS_BIRTH',
        'REGION_RATING_CLIENT_W_CITY',
        'REGION_RATING_CLIENT',
        'DAYS_LAST_PHONE_CHANGE',
        'DAYS_ID_PUBLISH',
        'REG_CITY_NOT_WORK_CITY',
        'FLAG_EMP_PHONE',
        'REG_CITY_NOT_LIVE_CITY',
        'FLAG_DOCUMENT_3',
        # Negative correlated to the target (top 10)
        'ELEVATORS_AVG',
        'REGION_POPULATION_RELATIVE',
        'AMT_GOODS_PRICE',
        'AMT_INCOME_TOTAL',
        'AMT_CREDIT',
        'AMT_ANNUITY',
        'FLOORSMAX_MODE',
        'FLOORSMAX_MEDI',
        'FLOORSMAX_AVG',
        'DAYS_EMPLOYED',
        'EXT_SOURCE_1',
        'EXT_SOURCE_3',
        'EXT_SOURCE_2'
        ]

In [5]:
df_train_pre, df_test_pre, y = load_train_test_data(DATA_HOME_DIR,in_cols=cols) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [6]:
df_train_pre.shape

(307511, 35)

In [7]:
df_test_pre.shape

(48744, 35)

### Feature Engineering

In [108]:
df_train = load_additional_features(df_train_pre)
df_test = load_additional_features(df_test_pre)
df_train, df_test = load_data_dummies(df_train, df_test)
df_train, df_test = append_credit_card_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)
df_train, df_test = append_poly_feature(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)
df_train, df_test = append_bureau_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)

In [109]:
#df_train, df_test = append_previous_applications(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)
df_train, df_test = append_pos_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)
#df_train, df_test = append_installments_data(in_dir=DATA_HOME_DIR, df_train=df_train, df_test=df_test)

In [110]:
df_train.shape

(307511, 383)

In [111]:
df_test.shape

(48744, 383)

Now calculate the class imbalance

In [136]:
clazz_weights = class_weight.compute_class_weight('balanced', np.unique(y), y)
clazz_weights

array([0.54390914, 6.19357503])

### Numericals standardizing

In [112]:
numerical_feats = [
            f for f in df_train.columns if df_train[f].dtype == 'float64' or df_train[f].dtype == 'int64'
        ]


In [113]:
df_train_norm = normalize_numericals(df_train, numerical_feats)

In [114]:
df_test_norm = normalize_numericals(df_test, numerical_feats)

# Split data
TODO: cross folds

In [115]:
x_train, x_test, y_train, y_test = train_test_split( df_train_norm, y, test_size=0.2, random_state=42)

In [116]:
x_train.shape

(246008, 383)

In [117]:
y_train.shape

(246008,)

## Model Setup
Lets create a simple convolutional model with 2 layers and try to train that. This will be our baseline performance for any convolutional architecture we try 

In [118]:
in_units = x_train.shape[1]
fd_units = in_units*2
learning_rate = 0.01

In [119]:
def add_layer(in_units):
    model.add(Dense(in_units, activation='linear'))
    model.add(LeakyReLU(alpha=.001))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))

In [120]:
model = Sequential()
model.add(Dense(fd_units, input_dim=in_units, activation='linear'))
model.add(LeakyReLU(alpha=.001))

add_layer(fd_units*2)
add_layer(int(fd_units/2))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy'])

In [121]:
batch_size=256

In [122]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_58 (Dense)             (None, 766)               294144    
_________________________________________________________________
leaky_re_lu_47 (LeakyReLU)   (None, 766)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 1532)              1175044   
_________________________________________________________________
leaky_re_lu_48 (LeakyReLU)   (None, 1532)              0         
_________________________________________________________________
batch_normalization_36 (Batc (None, 1532)              6128      
_________________________________________________________________
dropout_7 (Dropout)          (None, 1532)              0         
_________________________________________________________________
dense_60 (Dense)             (None, 383)               587139    
__________

In [123]:
from roc_callback import *
roc_cb = roc_callback(training_data=(x_train, y_train),validation_data=(x_test, y_test))

Fit the model but weight the loss function as we have an imbalanced data set. If we dont do this we optimise to the point of just predicting the positive cases and the ROC metric would converge to .5 (ie garbage:)

In [137]:
model.fit(x_train, y_train, epochs=100, batch_size=batch_size, callbacks=[roc_cb], validation_data=(x_test, y_test),
         class_weight=clazz_weights)

Train on 246008 samples, validate on 61503 samples
Epoch 1/100
roc-auc: 0.6288 - roc-auc_val: 0.628                                                                                                    
Epoch 2/100
roc-auc: 0.6277 - roc-auc_val: 0.6272                                                                                                    
Epoch 3/100
roc-auc: 0.6278 - roc-auc_val: 0.6271                                                                                                    
Epoch 4/100
roc-auc: 0.6287 - roc-auc_val: 0.627                                                                                                    
Epoch 5/100


KeyboardInterrupt: 

In [138]:
model.save_weights('m0_lr01.hdf5')

In [None]:
model.optimizer.lr = 0.001
model.fit(x_train, y_train, epochs=100, batch_size=batch_size, callbacks=[roc_cb], validation_data=(x_test, y_test))

Model 2

In [144]:
model = Sequential()
model.add(Dense(fd_units, input_dim=in_units, activation='linear'))
model.add(LeakyReLU(alpha=.001))

add_layer(fd_units*4)
add_layer(fd_units*4)
add_layer(int(fd_units/2))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy'])

In [145]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_74 (Dense)             (None, 766)               294144    
_________________________________________________________________
leaky_re_lu_60 (LeakyReLU)   (None, 766)               0         
_________________________________________________________________
dense_75 (Dense)             (None, 3064)              2350088   
_________________________________________________________________
leaky_re_lu_61 (LeakyReLU)   (None, 3064)              0         
_________________________________________________________________
batch_normalization_46 (Batc (None, 3064)              12256     
_________________________________________________________________
dropout_17 (Dropout)         (None, 3064)              0         
_________________________________________________________________
dense_76 (Dense)             (None, 3064)              9391160   
__________

In [None]:
model.optimizer.lr = 0.01
model.fit(x_train, y_train, epochs=10, batch_size=batch_size, callbacks=[roc_cb], validation_data=(x_test, y_test))

Train on 246008 samples, validate on 61503 samples
Epoch 1/10
roc-auc: 0.6031 - roc-auc_val: 0.6028                                                                                                    
Epoch 2/10
roc-auc: 0.6236 - roc-auc_val: 0.6255                                                                                                    
Epoch 3/10
roc-auc: 0.626 - roc-auc_val: 0.6272                                                                                                    
Epoch 4/10

In [None]:
model.save_weights('m2_lr01.hdf5')

Model 3

In [None]:
model = Sequential()
model.add(Dense(fd_units, input_dim=in_layers, activation='linear'))
model.add(LeakyReLU(alpha=.001))

add_layer(fd_units*16)
add_layer(fd_units*8)
add_layer(fd_units*8)
add_layer(fd_units*4)
add_layer(fd_units*2)
add_layer(int(fd_units/2))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy'])

In [None]:
model.save_weights('m3_lr01.hdf5')

In [None]:
model.optimizer.lr = 0.01
model.fit(x_train, y_train, epochs=100, batch_size=batch_size, callbacks=[roc_cb], validation_data=(x_test, y_test))

In [None]:
model.optimizer.lr = 0.001
model.fit(x_train, y_train, epochs=100, batch_size=batch_size, callbacks=[roc_cb], validation_data=(x_test, y_test))

In [None]:
model.save_weights('m3_lr001.hdf5')