# DNN model optimization notebook
Author: Ida Thrane (idth@itu.dk)

In this notebook the best parameters of the DNN model are found.

In [1]:
#Import libraries
import pickle
import pandas as pd

from sklearn.metrics import accuracy_score
from imblearn.metrics import geometric_mean_score
from numpy import mean as mean_np 
from numpy import std as std_np
from numpy import unique, bincount 


from keras import backend as K
from tensorflow.keras import Input, Model
from tensorflow.keras.losses import BinaryFocalCrossentropy
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam

#from tensorflow.keras.optimizers import Adam
from tensorflow.random import set_seed
from tensorflow import device
from tensorflow.test import gpu_device_name
device_name = gpu_device_name()

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from collections import defaultdict

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from models import create_nn_model

2023-08-22 13:28:08.674330: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load data

In [6]:
with open('data/X_train.pickle', 'rb') as file:
    X_train = pd.read_pickle(file)
    X_train = X_train.values
    
with open('data/y_train.pickle', 'rb') as file:
    y_train = pd.read_pickle(file)
    y_train = y_train.values.ravel()

In [8]:
#Instantiate variables

# Set random state
random_state = 1
set_seed(random_state)

#Define inverse class weights
weight_major_class, weight_minor_class = len(y_train) / (len(unique(y_train_street)) * bincount(y_train_street.astype('int64')))

#Define dropout_rate
dropout_rate = 0.3

#Define number of splits
n_splits = 9

#Define input shape
input_shape = X_train_street.shape[1]

#Define alpha as the inverse class weight
alpha = bincount(y_train_street.astype('int64'))[0]/len(y_train_street)

#Define gamma values to optimize
gamma = [0.0, 1., 2.]

#Define learning rate
learning_rate = 0.001

#Define class weight to optimize
class_weight = [{0: 1., 1: 1.}, 
                {0: weight_minor_class, 1: weight_major_class}, 
                {0: weight_major_class, 1: weight_minor_class}]

#Define batch size
batch_size = 32

#Define number of epochs
n_epochs = 10

#Define metrics
metrics = {'accuracy': accuracy_score,
           'g-mean': geometric_mean_score}

#Define refitting metric
refitting_metric = 'g-mean'


### Define functions

In [1]:
#Define geometric mean evaluation metric
def geometric_mean_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    true_negatives = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    false_positives = K.sum(K.round(K.clip((1 - y_true) * y_pred, 0, 1)))
    false_negatives = K.sum(K.round(K.clip(y_true * (1 - y_pred), 0, 1)))
    specificity = true_negatives / (true_negatives + false_positives + K.epsilon()) #Epsilon added to ensure there is no zero-division
    sensitivity = true_positives / (true_positives + false_negatives + K.epsilon()) #Epsilon added to ensure there is no zero-division
    return K.sqrt(specificity * sensitivity) 

In [17]:
def grid_search(model_build,
                X,
                y, 
                n_splits, 
                input_shape, 
                learning_rate, 
                alpha, 
                gammas, 
                batch_size, 
                n_epochs, 
                scoring, 
                random_state,
                dropout_rate,
                class_weight):
    
    #Define stratified cross validation setup
    skf = StratifiedKFold(n_splits=n_splits, 
                          shuffle=True,
                          random_state=random_state)
    
    #For each class weights and each gamma, run stratified cross validation
    results = defaultdict(list)
    results_fold = defaultdict(list)
    for weight in class_weight:
        print('Class weights:', weight)
        for gamma in gammas:
            print('Gamma:', gamma)
            results_fold = defaultdict(list)
            for i, (train_index, test_index) in enumerate(skf.split(X, y)):
                X_train = X[train_index]
                X_test = X[test_index]
                y_train = y[train_index]
                y_test = y[test_index]

                print('\tFold:', i)
                
                #Create model
                model = create_nn_model(input_shape,
                                     learning_rate,
                                     loss = BinaryFocalCrossentropy(alpha=alpha, gamma=gamma))
                #Fit model
                hist = model.fit(X_train,
                                 y_train,
                                 steps_per_epoch=(len(train_index)//batch_size),
                                 epochs=n_epochs,
                                 batch_size=batch_size,
                                 verbose=0)
                
                #Predict
                predictions = (model.predict(X_test, verbose=0) > 0.5).astype('int32').reshape(y_test.shape[0])
                
                #Compute scores for this set of parameters
                for scoring_name, scoring_method in scoring.items():
                    results_fold[scoring_name].append(scoring_method(y_test, predictions))
                    results[scoring_name].append(results_fold[scoring_name])
                    print(scoring_name, ": Mean: ", mean_np(results_fold[scoring_name]), "std", std_np(results_fold[scoring_name]))
            
            #Print results
            print("Accuracy Mean: ", mean_np(results["accuracy"]), "std", std_np(results["accuracy"]))
            print("G-mean Mean: ", mean_np(results["g-mean"]), "std", std_np(results["g-mean"]))
        print("Best G-mean Mean: ", max(results["g-mean"]))




In [18]:
#Define parameters to fit in the grid search 
params = {'model_build': create_nn_model(),
          'X': X_train,
          'y': y_train,
          'n_splits': n_splits,
          'input_shape': input_shape,
          'scoring': metrics,
          'random_state': random_state,
          'learning_rate': learning_rate,
          'alpha': alpha,
          'gammas': gamma,
          'dropout_rate': dropout_rate,
          'class_weight': class_weight,
          'batch_size': batch_size,
          'n_epochs': n_epochs}

### Grid search with oversampled data (sampling strategy 0.50)

In [19]:
###Sampling strategy 0.5

oversample50 = RandomOverSampler(sampling_strategy=0.50)
over_X_50, over_y_50 = oversample50.fit_resample(X_train, y_train)

In [20]:
params.update({'X': over_X_50, 
               'y': over_y_50})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.6666438964410137 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 1
accuracy : Mean:  0.6666438964410137 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 2
accuracy : Mean:  0.6666438964410137 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 3
accuracy : Mean:  0.6666438964410137 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 4
accuracy : Mean:  0.666653004842386 std 1.8216802744674965e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 5
accuracy : Mean:  0.6666590771099675 std 2.1468707920495633e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 6
accuracy : Mean:  0.6666634144439544 std 2.2537448503740865e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 7
accuracy : Mean:  0.6666666674444446 std 2.2771003430843706e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 8
accuracy : Mean:  0.6666691975559369 std 2.2630005149887758e-05
g-mean : Mean:  0.0 std 0.0
Accuracy Mean:  0.6666691975559368 std 2.263000514988776e-05
G-mean Mean:  0.0 std 0.0
Gamma: 1.0
	Fold: 0
accuracy : Mean:  

accuracy : Mean:  0.6666691975559369 std 2.2630005149887758e-05
g-mean : Mean:  0.0 std 0.0
Accuracy Mean:  0.7312078503167977 std 0.055949813267861206
G-mean Mean:  0.4094445941329683 std 0.3547116037157611
Gamma: 1.0
	Fold: 0
accuracy : Mean:  0.7781269212377895 std 0.0
g-mean : Mean:  0.7043511382620055 std 0.0
	Fold: 1
accuracy : Mean:  0.7799713095156773 std 0.001844388277887854
g-mean : Mean:  0.7053133057938648 std 0.0009621675318592371
	Fold: 2
accuracy : Mean:  0.7811553612496299 std 0.0022520662580008633
g-mean : Mean:  0.711391672845603 std 0.008631933132515812
	Fold: 3
accuracy : Mean:  0.7809959696700595 std 0.0019697890368599123
g-mean : Mean:  0.7097419268556078 std 0.00800297992610773
	Fold: 4
accuracy : Mean:  0.7811979234338204 std 0.0018075387604629056
g-mean : Mean:  0.7127292711450675 std 0.00932389691991577
	Fold: 5
accuracy : Mean:  0.7812300862608885 std 0.001651616141653846
g-mean : Mean:  0.7145087652807209 std 0.009395683851560283
	Fold: 6
accuracy : Mean:  0