# Model selection: <font color='#0041C2'>Neural Network</font>
---
- 1. Model 1 - No SMOTE + no dropping of columns
- 2. Model 2 - SMOTE + no dropping of columns
- 3. Model 3 - SMOTE + dropped columns

# Setting up the notebook

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

from sklearn.metrics import recall_score, fbeta_score, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV

import tensorflow as tf
tf.autograph.set_verbosity(0)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Recall, AUC, Precision
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow_addons as tfa

tf.get_logger().setLevel("INFO")

 The versions of TensorFlow you are currently using is 2.4.0-rc0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
df_train = pd.read_csv("../dataset/train.csv")

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

In [None]:
def target_encoding(df_x, df_y):
    x = df_x.copy()
    
    # Target Encoding — categorical columns with high cardinality: profession, city, state
    profession_target_enc = TargetEncoder()
    x["profession_encoded"] = profession_target_enc.fit_transform(x["profession"], df_y)
    
    city_target_enc = TargetEncoder()
    x["city_encoded"] = city_target_enc.fit_transform(x["city"], df_y)
    
    state_target_enc = TargetEncoder()
    x["state_encoded"] = state_target_enc.fit_transform(x["state"], df_y)
    
    x.drop("profession", axis=1, inplace=True)
    x.drop("city", axis=1, inplace=True)
    x.drop("state", axis=1, inplace=True)
    return x

scale_features = ['income','age','experience']
x_train = target_encoding(x_train, y_train)
scaler = MinMaxScaler()
x_train[scale_features] = scaler.fit_transform(x_train[scale_features])

 # Model 1 - No SMOTE + no dropping of columns

In [4]:
skf = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)
scores = []

for train, val in skf.split(x_train, y_train):
    model = Sequential()
    model.add(Dense(120, input_shape=(13,), activation='relu'))
    model.add(Dense(120, activation='relu'))
    model.add(Dense(120, activation='relu'))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=["accuracy", Recall(), Precision(), tfa.metrics.FBetaScore(num_classes=2, beta=2.0), AUC()])

    x_t, y_t = x_train.iloc[train], y_train.iloc[train]
    x_val, y_val = x_train.iloc[val], y_train.iloc[val]

    y_t = to_categorical(y_t, 2)
    y_val = to_categorical(y_val, 2)

    model_fit = model.fit(x_t, y_t, epochs=5, batch_size=64, verbose=0)
    result = model.evaluate(x_val, y_val, verbose=0)
    scores.append(result)
    
loss = [ val[0] for val in scores ]
accuracy = [ val[1] for val in scores ]
recall = [ val[2] for val in scores ]
precision = [ val[3] for val in scores ]
fbeta_2 = [ val[4][0] for val in scores ]
auc = [ val[5] for val in scores ]

loss.append( sum(loss) / len(loss) )
accuracy.append( sum(accuracy) / len(accuracy) )
recall.append( sum(recall) / len(recall) )
precision.append( sum(precision) / len(precision) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[loss, accuracy, recall, precision, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Loss', 'Accuracy', 'Recall', 'Precision', 'Fbeta2', 'AUC'])

display(score_df)

2021-11-12 07:27:33.735073: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-12 07:27:33.823603: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-12 07:27:33.824376: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-12 07:27:33.826293: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Loss,0.370592,0.371189,0.371562,0.371114
Accuracy,0.877009,0.876994,0.876994,0.876999
Recall,0.877009,0.876994,0.876994,0.876999
Precision,0.877009,0.876994,0.876994,0.876999
Fbeta2,0.972717,0.972714,0.972714,0.972715
AUC,0.889712,0.889536,0.887959,0.889069


# Model 2 - SMOTE + no dropping of columns

In [5]:
skf = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)
scores = []

for train, val in skf.split(x_train, y_train):
    model = Sequential()
    model.add(Dense(120, input_shape=(13,), activation='relu'))
    model.add(Dense(120, activation='relu'))
    model.add(Dense(120, activation='relu'))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=["accuracy", Recall(), Precision(), tfa.metrics.FBetaScore(num_classes=2, beta=2.0), AUC()])

    x_t, y_t = x_train.iloc[train], y_train.iloc[train]
    x_val, y_val = x_train.iloc[val], y_train.iloc[val]

    oversampler = SMOTE(random_state=2021)
    x_t, y_t = oversampler.fit_resample(x_t, y_t)
    
    y_t = to_categorical(y_t, 2)
    y_val = to_categorical(y_val, 2)

    model_fit = model.fit(x_t, y_t, epochs=5, batch_size=64, verbose=0)
    result = model.evaluate(x_val, y_val, verbose=0)
    scores.append(result)
    
loss = [ val[0] for val in scores ]
accuracy = [ val[1] for val in scores ]
recall = [ val[2] for val in scores ]
precision = [ val[3] for val in scores ]
fbeta_2 = [ val[4][0] for val in scores ]
auc = [ val[5] for val in scores ]

loss.append( sum(loss) / len(loss) )
accuracy.append( sum(accuracy) / len(accuracy) )
recall.append( sum(recall) / len(recall) )
precision.append( sum(precision) / len(precision) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[loss, accuracy, recall, precision, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Loss', 'Accuracy', 'Recall', 'Precision', 'Fbeta2', 'AUC'])

display(score_df)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Loss,0.639144,0.655724,0.639206,0.644691
Accuracy,0.604613,0.604137,0.623065,0.610605
Recall,0.604613,0.604137,0.623065,0.610605
Precision,0.604613,0.604137,0.623065,0.610605
Fbeta2,0.646006,0.64764,0.666823,0.653489
AUC,0.667023,0.640978,0.677277,0.661759


# Model 3 - SMOTE + dropped columns

In [6]:
x_train.drop(['current_house_years', 'current_job_years', 'norent_noown', 'owned', 'marital_status'], axis=1, inplace=True)

skf = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)
scores = []

for train, val in skf.split(x_train, y_train):
    model = Sequential()
    model.add(Dense(120, input_shape=(8,), activation='relu'))
    model.add(Dense(120, activation='relu'))
    model.add(Dense(120, activation='relu'))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=["accuracy", Recall(), Precision(), tfa.metrics.FBetaScore(num_classes=2, beta=2.0), AUC()])

    x_t, y_t = x_train.iloc[train], y_train.iloc[train]
    x_val, y_val = x_train.iloc[val], y_train.iloc[val]

    oversampler = SMOTE(random_state=2021)
    x_t, y_t = oversampler.fit_resample(x_t, y_t)
    
    y_t = to_categorical(y_t, 2)
    y_val = to_categorical(y_val, 2)

    model_fit = model.fit(x_t, y_t, epochs=5, batch_size=64, verbose=0)
    result = model.evaluate(x_val, y_val, verbose=0)
    scores.append(result)
    
loss = [ val[0] for val in scores ]
accuracy = [ val[1] for val in scores ]
recall = [ val[2] for val in scores ]
precision = [ val[3] for val in scores ]
fbeta_2 = [ val[4][0] for val in scores ]
auc = [ val[5] for val in scores ]

loss.append( sum(loss) / len(loss) )
accuracy.append( sum(accuracy) / len(accuracy) )
recall.append( sum(recall) / len(recall) )
precision.append( sum(precision) / len(precision) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[loss, accuracy, recall, precision, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Loss', 'Accuracy', 'Recall', 'Precision', 'Fbeta2', 'AUC'])

display(score_df)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Loss,0.652305,0.654844,0.641181,0.649443
Accuracy,0.59994,0.567277,0.608348,0.591855
Recall,0.59994,0.567277,0.608348,0.591855
Precision,0.59994,0.567277,0.608348,0.591855
Fbeta2,0.645145,0.604652,0.65588,0.635226
AUC,0.656609,0.632255,0.669542,0.652802


#### All results
|           | Model 1  | Model 2  | Model 3  |
|-----------|----------|----------|----------|
| Loss      | 0.371114 | 0.644691 | 0.649443 |
| Accuracy  | 0.876999 | 0.610605 | 0.591855 |
| Recall    | 0.876999 | 0.610605 | 0.591855 |
| Precision | 0.876999 | 0.610605 | 0.591855 |
| Fbeta2    | 0.972715 | 0.653489 | 0.635226 |
| AUC       | 0.889069 | 0.661759 | 0.652802 |