In [4]:
import pandas as pd
from pandas import Series, DataFrame
import os
import sys
import numpy as np

In [5]:
import keras
from keras import layers, utils, optimizers, losses, models, callbacks
# Echo the installed Keras version as this cell's output.
keras.__version__

Using TensorFlow backend.


'2.1.6'

In [6]:
# Read the training applications table from ./data/application_train.csv.
application_csv = os.path.join('.', 'data', 'application_train.csv')
applicationDF = pd.read_csv(application_csv)

In [11]:
# Import the column helpers under distinct names so that the assignments below
# do not overwrite the imported functions with their own return values.
from tools.modelTools import categoricalColumns as get_categorical_columns
from tools.modelTools import numericalColumns as get_numerical_columns

# presumably collections of column labels (`numericalColumns` is used further
# down to select columns of `applicationDF`) — TODO confirm in tools.modelTools
categoricalColumns = get_categorical_columns()
numericalColumns = get_numerical_columns()

In [12]:
# Keep only the numerical feature columns and attach the label column to them.
targets = applicationDF["TARGET"]
numeric_features = applicationDF[numericalColumns]
quant_df = pd.concat([numeric_features, targets], axis=1)

# Rows labelled TARGET == 1; the cell output is how many of them there are.
is_positive = quant_df["TARGET"] == 1
positive_loans = quant_df[is_positive]
len(positive_loans)

24825

In [13]:
# Oversample the TARGET == 1 rows: stack the frame onto itself so that every row
# appears 11 times (the original plus 10 copies), then show the new row count.
# pd.concat is used because DataFrame.append was deprecated and later removed.
# NOTE(review): the duplication runs before the validation split further down,
# so copies of the same row can land in both the training and validation sets.
N_POSITIVE_COPIES = 10
positive_loans = pd.concat([positive_loans] * (N_POSITIVE_COPIES + 1), ignore_index=True)
positive_loans.shape[0]

273075

In [14]:
# Rows labelled TARGET == 0; the cell output is how many of them there are.
is_negative = quant_df["TARGET"] == 0
negative_loans = quant_df[is_negative]
len(negative_loans)

282686

In [15]:
# Stack the oversampled positive rows on top of the negative rows.
# pd.concat replaces DataFrame.append, which pandas deprecated and later removed.
# Row labels are kept as they are here; the shuffle cell below renumbers them.
quant_df = pd.concat([positive_loans, negative_loans])

In [16]:
# Shuffle all rows (a full-size random sample) and renumber them from zero.
import csv
quant_df = (
    quant_df
    .sample(frac=1)
    .reset_index(drop=True)
)

In [17]:
# Replace every missing value with the mean of its own column.
# NOTE(review): the means are computed before the validation split below, so
# they include rows that later end up in the validation set.
column_means = quant_df.mean()
quant_df = quant_df.fillna(column_means)

In [18]:
# Hold out a random 20% of the rows for validation; the rest is the training set.
validation_set = quant_df.sample(frac=0.2)
# Remove the held-out rows by their row labels (assumes the labels are unique,
# as produced by the reset_index in the shuffle cell above).
training_set = quant_df.drop(validation_set.index, axis=0).reset_index(drop=True)
validation_set = validation_set.reset_index(drop=True)

In [19]:
# Pull the label column out of each split before it is removed from the features.
training_targets = training_set["TARGET"]
validation_targets = validation_set["TARGET"]

# Keep only the feature columns and renumber the rows from zero.
training_set = training_set.drop("TARGET", axis=1).reset_index(drop=True)
validation_set = validation_set.drop("TARGET", axis=1).reset_index(drop=True)

In [20]:
# Swap both feature frames for their underlying NumPy arrays.
training_set, validation_set = training_set.values, validation_set.values

In [21]:
# One-hot encode the labels.
# `training_targets` is overwritten with its encoded form, while the validation
# labels keep their raw version and get a separate encoded copy.
validation_targets_one_hot = utils.to_categorical(validation_targets)
training_targets = utils.to_categorical(training_targets)

In [22]:
# Min-max scale the features, using statistics learned from the training data.
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
# Learn the per-column min/max on the training data and scale it in one step.
training_set = min_max_scaler.fit_transform(training_set)
# Apply that same scaling to the validation data. Calling `fit` here would refit
# the scaler on validation data and return the scaler object, not a scaled array.
validation_set = min_max_scaler.transform(validation_set)

In [None]:
# define the NN architecture
# The input width is read from the training matrix instead of being hard-coded,
# so the network keeps matching the data if the set of feature columns changes.
n_features = training_set.shape[1]
inputs = layers.Input(shape=(n_features,), name="inputs")

# Four fully connected ReLU layers.
fc1 = layers.Dense(units=256, activation='relu', name='fc1')(inputs)
fc2 = layers.Dense(units=512, activation='relu', name='fc2')(fc1)
fc3 = layers.Dense(units=512, activation='relu', name='fc3')(fc2)
fc4 = layers.Dense(units=512, activation='relu', name='fc4')(fc3)

# Two-way softmax: one output unit per one-hot encoded class.
outputs = layers.Dense(units=2, activation='softmax', name='outputs')(fc4)
model = models.Model(inputs=[inputs], outputs=[outputs])

In [None]:
# Training configuration: early stopping that watches val_loss with a patience
# of 3 epochs, binary cross-entropy as the loss, and Adam with lr = 1e-4.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
)
loss = losses.binary_crossentropy
optimizer = optimizers.Adam(lr=1e-4)

In [None]:
# Compile the model, then train for up to 50 epochs; a 10% validation split is
# passed to fit, and the early-stopping callback monitors val_loss.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.fit(
    x=training_set,
    y=training_targets,
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Score the model on the held-out validation data and print loss and accuracy.
evaluation = model.evaluate(x=validation_set, y=validation_targets_one_hot)
loss_, acc_ = evaluation

print("Loss: {}, Accuracy: {}".format(loss_, acc_))

In [None]:
# ROC analysis on the held-out validation data.
from sklearn.metrics import roc_curve, roc_auc_score
y_pred_keras = model.predict(validation_set)
# Score each row with the predicted probability of class 1 (column 1 of the
# softmax output). Hard argmax labels would collapse the ROC curve to a single
# operating point and understate the AUC.
positive_class_scores = y_pred_keras[:, 1]
fpr, tpr, thresholds = roc_curve(validation_targets, positive_class_scores)
# roc_auc_score takes the true labels first and the scores second.
print("AUC: {}".format(roc_auc_score(validation_targets, positive_class_scores)))

In [25]:
def create_NN(inputRowCount):
    """Build an uncompiled fully connected classifier.

    The network has four ReLU dense layers (256, 512, 512 and 512 units)
    followed by a 2-unit softmax output layer.

    Args:
        inputRowCount: number of input features; used as the width of the
            input layer.

    Returns:
        The newly built (not yet compiled) Keras ``Model``.
    """
    # The former `try: del model / except: pass` guard was removed: `model` is a
    # local name that is still unbound at that point, so the `del` always raised
    # and the bare `except` silently swallowed the error.

    # define the NN architecture
    inputs = layers.Input(shape=(inputRowCount,), name="inputs")

    fc1 = layers.Dense(units=256, activation='relu', name='fc1')(inputs)
    fc2 = layers.Dense(units=512, activation='relu', name='fc2')(fc1)
    fc3 = layers.Dense(units=512, activation='relu', name='fc3')(fc2)
    fc4 = layers.Dense(units=512, activation='relu', name='fc4')(fc3)

    outputs = layers.Dense(units=2, activation='softmax', name='outputs')(fc4)
    model = models.Model(inputs=[inputs], outputs=[outputs])
    return model

def train_NN(X, y, inputRowCount, epochs):
    """Build a fresh network with ``create_NN``, train it, and return it.

    Training uses Adam (lr=0.0001), binary cross-entropy, a batch size of 256,
    a 10% validation split, and early stopping on ``val_loss`` with a patience
    of 3 epochs.

    Args:
        X: training features, passed to ``model.fit`` as ``x``.
        y: training targets, passed to ``model.fit`` as ``y``.
        inputRowCount: number of input features, forwarded to ``create_NN``.
        epochs: maximum number of training epochs.

    Returns:
        The trained model. The function used to return ``None``, which
        discarded the model it had just trained.
    """
    # set up the optimizer and the loss function
    optimizer = optimizers.Adam(lr=0.0001)
    loss = losses.binary_crossentropy
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1)

    model = create_NN(inputRowCount)

    model.compile(loss=loss, metrics=['accuracy'], optimizer=optimizer)
    model.fit(x=X, y=y, batch_size=256, validation_split=0.1, callbacks=[early_stopping], epochs=epochs)
    # Hand the trained model back so callers can evaluate or reuse it.
    return model

In [26]:
# Train a fresh network for a single epoch on the prepared training data.
train_NN(
    X=training_set,
    y=training_targets,
    inputRowCount=69,
    epochs=1,
)

Train on 400148 samples, validate on 44461 samples
Epoch 1/1


In [27]:
# Run the one-epoch training twice in a row; each call builds a fresh network.
for repeat in range(2):
    train_NN(
        X=training_set,
        y=training_targets,
        inputRowCount=69,
        epochs=1,
    )

Train on 400148 samples, validate on 44461 samples
Epoch 1/1
Train on 400148 samples, validate on 44461 samples
Epoch 1/1


## Model

In [8]:
# Width of the input layer of the model defined below.
N_FEAT = 122
# Unit counts of the three hidden dense layers, in order (fc_1, fc_2, fc_3).
N_UNITS = [1024, 1024, 512]

In [16]:
# Fully connected binary classifier: N_FEAT inputs -> three ReLU layers -> 1 output.
input_ = layers.Input(shape=(N_FEAT, ), name="Input")

fc_1 = layers.Dense(units=N_UNITS[0], activation='relu', name='fc_1')(input_)
fc_2 = layers.Dense(units=N_UNITS[1], activation='relu', name='fc_2')(fc_1)
fc_3 = layers.Dense(units=N_UNITS[2], activation='relu', name='fc_3')(fc_2)
# A single output unit needs a sigmoid. Softmax normalises across the units of
# the layer, so with one unit it always outputs 1.0 and the model cannot learn.
output = layers.Dense(units=1, activation='sigmoid', name='output')(fc_3)

model = models.Model(inputs=input_, outputs=output)
model.compile(loss=losses.binary_crossentropy, optimizer='adam')
