# IS424: Data Mining & Biz Analytics
### Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Model selection: <font color='#0041C2'>Neural Network</font>
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import recall_score, fbeta_score, roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

import tensorflow as tf
tf.autograph.set_verbosity(0)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Recall, AUC, Precision
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow_addons as tfa

tf.get_logger().setLevel("INFO")

 The versions of TensorFlow you are currently using is 2.4.0-rc0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
df_train = pd.read_csv("../dataset/train.csv")

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

In [3]:
def target_encoding(df_x, df_y):
    x = df_x.copy()
    
    # Target Encoding — categorical columns with high cardinality: profession, city, state
    profession_target_enc = TargetEncoder()
    x["profession_encoded"] = profession_target_enc.fit_transform(x["profession"], df_y)
    
    city_target_enc = TargetEncoder()
    x["city_encoded"] = city_target_enc.fit_transform(x["city"], df_y)
    
    state_target_enc = TargetEncoder()
    x["state_encoded"] = state_target_enc.fit_transform(x["state"], df_y)
    
    x.drop("profession", axis=1, inplace=True)
    x.drop("city", axis=1, inplace=True)
    x.drop("state", axis=1, inplace=True)
    return x

scale_features = ['income','age','experience']
x_train = target_encoding(x_train, y_train)

scaler = MinMaxScaler()
x_train[scale_features] = scaler.fit_transform(x_train[scale_features])

  elif pd.api.types.is_categorical(cols):


# 2. Running base model

In [4]:
# https://towardsdatascience.com/simple-guide-to-hyperparameter-tuning-in-neural-networks-3fe03dad8594
# https://medium.com/octavian-ai/which-optimizer-and-learning-rate-should-i-use-for-deep-learning-5acb418f9b2
skf = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

def create_run_model(lr, bs, x_train, y_train, x_val, y_val):
    model = Sequential()
    model.add(Dense(120, input_shape=(9,), activation="relu"))
    model.add(Dense(120, activation="relu"))
    model.add(Dense(120, activation="relu"))
    model.add(Dense(2, activation="softmax"))

    model.compile(loss='binary_crossentropy', 
                  optimizer=Adam(lr=lr), 
                  metrics=["accuracy", Recall(), tfa.metrics.FBetaScore(num_classes=2, beta=2.0), AUC()])

    y_train = to_categorical(y_train, 2)
    y_val = to_categorical(y_val, 2)
    
    es = EarlyStopping(monitor="loss", patience=5, min_delta=0.001)
    model_fit = model.fit(x_train, y_train, epochs=5, batch_size=bs, verbose=0, callbacks=[es])
    result = model.evaluate(x_val, y_val, verbose=0)
    
    return result

# https://stats.stackexchange.com/questions/164876/what-is-the-trade-off-between-batch-size-and-number-of-iterations-to-train-a-neu
# https://towardsdatascience.com/optimizers-for-training-neural-network-59450d71caf6
# https://machinelearningmastery.com/choose-an-activation-function-for-deep-learning/
# https://towardsdatascience.com/simple-guide-to-hyperparameter-tuning-in-neural-networks-3fe03dad8594

learning_rates = [0.01, 0.001]
batch_sizes = [32, 64, 128]
scores = []
variations = []

for train, val in skf.split(x_train, y_train):
    fold_scores = []
    for lr in learning_rates:
        for bs in batch_sizes:
            result = create_run_model(lr, bs, x_train.iloc[train], y_train.iloc[train], x_train.iloc[val], y_train.iloc[val])
            fold_scores.append(result)
            variation = "adam(" + str(lr) + ") bs=" + str(bs)
            if variation not in variations:
                variations.append(variation)
    scores.append(fold_scores)
    
# fold 1 - adam(0.01) 32 | adam(0.01) 64 | adam(0.01) 128 | adam(0.001) 32 | adam(0.001) 64 | adam(0.001) 128
# fold 2 - adam(0.01) 32 | adam(0.01) 64 | adam(0.01) 128 | adam(0.001) 32 | adam(0.001) 64 | adam(0.001) 128
# fold 3 - adam(0.01) 32 | adam(0.01) 64 | adam(0.01) 128 | adam(0.001) 32 | adam(0.001) 64 | adam(0.001) 128

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


2021-11-07 12:58:02.904911: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-11-07 12:58:02.905043: W tensorflow/core/platform/profile_utils/cpu_utils.cc:126] Failed to get CPU frequency: 0 Hz


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export A

In [5]:
print(scores)

[[[0.36189305782318115, 0.8769940733909607, 0.8769940733909607, array([0.9726336, 0.0010582], dtype=float32), 0.9054951071739197], [0.361697793006897, 0.8769196271896362, 0.8769196271896362, array([0.97210294, 0.00798867], dtype=float32), 0.9068682193756104]], [[0.3625982403755188, 0.8769940733909607, 0.8769940733909607, array([0.9727137, 0.       ], dtype=float32), 0.9053547382354736], [0.35540539026260376, 0.8773214221000671, 0.8773214221000671, array([0.9726734 , 0.00498369], dtype=float32), 0.9094765186309814]], [[0.3625231683254242, 0.8769940733909607, 0.8769940733909607, array([0.9727137, 0.       ], dtype=float32), 0.9052985906600952], [0.3568407893180847, 0.8770387172698975, 0.8770387172698975, array([0.9725015 , 0.00377518], dtype=float32), 0.9087041020393372]]]


In [6]:
loss = {}
recall = {}
fbeta_2 = {}
auc = {}

fold_no = 1

for fold in scores:
    temp_l = []
    temp_r = []
    temp_f = []
    temp_a = []
    
    for variation in fold:
        temp_l.append(variation[0])
        temp_r.append(variation[2])
        temp_f.append(variation[3][0])
        temp_a.append(variation[4])

    loss["Fold " + str(fold_no)] = temp_l
    recall["Fold " + str(fold_no)] = temp_r
    fbeta_2["Fold " + str(fold_no)] = temp_f
    auc["Fold " + str(fold_no)] = temp_a
    
    fold_no += 1

print(recall)
print(variations)    
loss["Average"] = []
recall["Average"] = []
fbeta_2["Average"] = []
auc["Average"] = []

for i in range(len(variations)):
    loss["Average"].append((loss["Fold 1"][i] + loss["Fold 2"][i] + loss["Fold 3"][i]) / 3)
    recall["Average"].append((recall["Fold 1"][i] + recall["Fold 2"][i] + recall["Fold 3"][i]) / 3)
    fbeta_2["Average"].append((fbeta_2["Fold 1"][i] + fbeta_2["Fold 2"][i] + fbeta_2["Fold 3"][i]) / 3)
    auc["Average"].append((auc["Fold 1"][i] + auc["Fold 2"][i] + auc["Fold 3"][i]) / 3)
    
print()
print("===RECALL===")
score_df = pd.DataFrame(data=recall, index=variations)
display(score_df)

print("===FBETA===")
score_df = pd.DataFrame(data=fbeta_2, index=variations)
display(score_df)

print("===AUC===")
score_df = pd.DataFrame(data=auc, index=variations)
display(score_df)

{'Fold 1': [0.8769940733909607, 0.8769196271896362], 'Fold 2': [0.8769940733909607, 0.8773214221000671], 'Fold 3': [0.8769940733909607, 0.8770387172698975]}
['adam(0.01) bs=64', 'adam(0.001) bs=64']

===RECALL===


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
adam(0.01) bs=64,0.876994,0.876994,0.876994,0.876994
adam(0.001) bs=64,0.87692,0.877321,0.877039,0.877093


===FBETA===


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
adam(0.01) bs=64,0.972634,0.972714,0.972714,0.972687
adam(0.001) bs=64,0.972103,0.972673,0.972502,0.972426


===AUC===


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
adam(0.01) bs=64,0.905495,0.905355,0.905299,0.905383
adam(0.001) bs=64,0.906868,0.909477,0.908704,0.90835
