In [46]:
import pandas as pd
import numpy as np
import random
import os

# tf and keras
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import models
from keras import layers
import keras_tuner as kt
from tensorflow import keras

# plots
import seaborn as sns
import matplotlib.pyplot as plt
 
np.set_printoptions(suppress=True)
pd.options.display.float_format = "{:,.6f}".format

#need this as we have a lot of columns
pd.options.display.max_rows = 1000

#hide warnings 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Load in the dataset

See the amex_random_forest.ipynb file for EDA and more info regarding the processing steps

In [9]:
df = pd.read_feather('train_data.ftr')

In [10]:
df = df.sort_values(by ='S_2')

In [None]:
#drop the ID column and date columns since it wont help out model
df.pop('customer_ID')
df.pop('S_2')

df.head(2)

I need to drop the columns up from above which were predominantly NaNs
 ['D_42', 'D_49', 'D_50', 'D_53', 'D_56', 'S_9', 'B_17', 'D_66', 'D_73', 'D_76', 'D_77', 'R_9', 'D_82', 'B_29', 'D_87', 'D_88', 'D_105', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']

In [13]:
del_cols = ['D_42', 'D_49', 'D_50', 'D_53', 'D_56', 'S_9', 'B_17', 'D_66', 'D_73', 'D_76', 
            'D_77', 'R_9', 'D_82', 'B_29', 'D_87', 'D_88', 'D_105', 'D_106', 'R_26', 'D_108',
            'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']

for i in del_cols: 
    df.pop(i)

Lets check out the new dimensions of our data:

In [14]:
df.shape

(5531451, 158)

In [15]:
cols = df.columns
#new_cols = [i for i in cols if i not in del_cols]

B_38 needs to be reindex to start index 0

In [16]:
B_38_mapping = {label: idx for idx, label in enumerate(np.unique(df['B_38']))}
df['B_38'] = df['B_38'].map(B_38_mapping)

#lets confirm it worked
df['B_38'].value_counts()

1    1953232
2    1255315
0    1160047
4     444856
3     294917
6     259028
5     162040
Name: B_38, dtype: int64

Drop the categoricals with negative values, unclear what these signify

D_117
D_126
D_64

For D_63, this column need to be one hot encoded

In [17]:
df.pop('D_117')
df.pop('D_126')
df.pop('D_64')
display(df.columns.get_loc("D_63"))

45

In [18]:
col = df.columns

In [19]:
%%time
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [45])], remainder='passthrough')
df = columnTransformer.fit_transform(df)

CPU times: user 5.79 s, sys: 10.2 s, total: 16 s
Wall time: 21.1 s


In [20]:
#the onehot encoded cols need to be standardized as well
column_trans = ColumnTransformer([('scaler', StandardScaler(),[0,1,2,3,4,5])],
                                   remainder='passthrough') 
df = column_trans.fit_transform(df)

for all cols which we didnt drop due to our nan thresholds we will impute the mean

In [21]:
%%time 

imp=SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value = -1)
df=pd.DataFrame(imp.fit_transform(df))

CPU times: user 8.02 s, sys: 12 s, total: 20 s
Wall time: 23.4 s


In [22]:
col = list(col)
col.remove('D_63')

In [23]:
one_hot_cols = ['onehot1','onehot2','onehot3','onehot4','onehot5','onehot6']
new_cols = one_hot_cols + col

In [24]:
df.columns = new_cols

In [25]:
X = df
Y = X.pop('target')

In [26]:
#shuffle default is True, we set to false to preserve time series ordering
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle = False)

# 0.25 x 0.8 = 0.2
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25,  shuffle = False) 

### Build the neural network

In [54]:
def nn_model(hp):
    tf.keras.backend.clear_session()
    model = tf.keras.Sequential()
    
    #define out input layer
    model.add(tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],)))
    
    #define range of value of nodes to include in our layers
    hp_units = hp.Int('units', min_value=500, max_value=1200, step=100)    
    
    #add full connected layers
    model.add(tf.keras.layers.Dense(units=hp_units,
                                    name='fc_1', 
                                    activation='relu'))       
              
    #dropout regularization technique, randomly sets nodes = 0 at probabilty set by rate
    model.add(tf.keras.layers.Dropout(rate=0.2))
    
    #add full connected layers
    model.add(tf.keras.layers.Dense(units=hp_units,
                                    name='fc_2', 
                                    activation='relu'))       
              
    #dropout regularization technique, randomly sets nodes = 0 at probabilty set by rate
    model.add(tf.keras.layers.Dropout(rate=0.2))
    
    # add the last fully connected layer
    # this last layer sets the activation function to "None" in order to output the logits 
    # note that passing activation = "sigmoid" will return class memembership probabilities but
    # in TensorFlow logits are prefered for numerical stability
    # set units=1 to get a single output unit (remember it's a binary classification problem)
    model.add(tf.keras.layers.Dense(units=1,
                                    name='output_layer',
                                    activation=None))
    
    #define ranges for learning rates
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    #compile it, with optimizer, loss, and accuracy
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), #set from_ligits=True because our last layer does not apply sigmoid
                  metrics=['accuracy']) 
                        
    return model

Now that we have constructed our model, we can hypertune it using the built in tuner from TF

In [55]:
tuner = kt.Hyperband(nn_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

Create a callback to stop training early after reaching a certain value for the validation loss.



In [56]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

`Tune and find best hyper-parameters`

Run the hyperparameter search. The arguments for the search method are the same as those used for tf.keras.model.fit in addition to the callback above.

In [57]:
tuner.search(X_train, Y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Trial 30 Complete [07h 19m 21s]
val_accuracy: 0.8371584415435791

Best val_accuracy So Far: 0.876420259475708
Total elapsed time: 3d 20h 34m 34s
INFO:tensorflow:Oracle triggered exit

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 1100 and the optimal learning rate for the optimizer
is 0.0001.



`Train final model with optimal hyperparameters`

Now all we have done is find the best hyperparameter for our neural network. The next step is to pull the best hyperparams and then train our final tuned neural network on that model.

In [58]:
# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, Y_train, epochs=50, validation_data = (X_val,Y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [59]:
#Lets find the best epoch
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Best epoch: 50


Now that we know what the best number of epochs are, we can train our model with that number of epochs

In [None]:
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(img_train, label_train, epochs=best_epoch, validation_data = (X_val,Y_val))

Since we already trained our model on the 50 epochs, we can just use that instead of training it again for 50 epochs. Given 50 epochs provided the best validation accuracy, next time we could increase the epochs

`Evaluate on Test Dataset`

In [62]:
eval_result = model.evaluate(X_test, Y_test)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.30836179852485657, 0.8955708742141724]
