# CNN model optimization notebook
Author: Ida Thrane (idth@itu.dk)

In this notebook the best parameters of the CNN model are found.

In [4]:
#Import libraries
import pickle
import pandas as pd

from sklearn.metrics import accuracy_score
from imblearn.metrics import geometric_mean_score
from numpy import mean as mean_np 
from numpy import std as std_np
from numpy import unique, bincount 


from keras import backend as K
from tensorflow.keras import Input, Model
from tensorflow.keras.losses import BinaryFocalCrossentropy
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.random import set_seed
from tensorflow import device
from tensorflow.test import gpu_device_name
device_name = gpu_device_name()

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from collections import defaultdict

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from models import create_cnn_model


### Load training data

In [2]:
with open('data/X_train.pickle', 'rb') as file:
    X_train = pd.read_pickle(file)
    X_train = X_train.values
    
with open('data/y_train.pickle', 'rb') as file:
    y_train = pd.read_pickle(file)
    y_train = y_train.values.ravel()

In [26]:
### Optimise dataset with all types of trees

with open('data/X_train_all.pickle', 'rb') as file:
    X_train_all = pd.read_pickle(file)
    X_train_all = X_train_all.values
    
with open('data/y_train_all.pickle', 'rb') as file:
    y_train_all = pd.read_pickle(file)
    y_train_all = y_train_all.values.ravel()

In [11]:
### Optimise dataset with all types of trees

with open('data/X_train_streets.pickle', 'rb') as file:
    X_train_street = pd.read_pickle(file)
    X_train_street = X_train_street.values
    
with open('data/y_train_streets.pickle', 'rb') as file:
    y_train_street = pd.read_pickle(file)
    y_train_street = y_train_street.values.ravel()

In [13]:
#Instantiate variables

# Set random state
random_state = 1
set_seed(random_state)

#Define inverse class weights
weight_major_class, weight_minor_class = len(y_train) / (len(unique(y_train_street)) * bincount(y_train_street.astype('int64')))

#Define dropout_rate
dropout_rate = 0.3

#Define number of splits
n_splits = 9

#Define input shape
input_shape = X_train_street.shape[1]

#Define alpha as the inverse class weight
alpha = bincount(y_train_street.astype('int64'))[0]/len(y_train_street)

#Define gamma values to optimize
gamma = [0.0, 1., 2.]

#Define learning rate
learning_rate = 0.001

#Define class weight to optimize
class_weight = [{0: 1., 1: 1.}, 
                {0: weight_minor_class, 1: weight_major_class}, 
                {0: weight_major_class, 1: weight_minor_class}]

#Define batch size
batch_size = 32

#Define number of epochs
n_epochs = 10

#Define metrics
metrics = {'accuracy': accuracy_score,
           'g-mean': geometric_mean_score}

#Define refitting metric
refitting_metric = 'g-mean'


### Define functions 

In [3]:
#Define geometric mean evaluation metric
def geometric_mean_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    true_negatives = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    false_positives = K.sum(K.round(K.clip((1 - y_true) * y_pred, 0, 1)))
    false_negatives = K.sum(K.round(K.clip(y_true * (1 - y_pred), 0, 1)))
    specificity = true_negatives / (true_negatives + false_positives + K.epsilon()) #Epsilon added to ensure there is no zero-division
    sensitivity = true_positives / (true_positives + false_negatives + K.epsilon()) #Epsilon added to ensure there is no zero-division
    return K.sqrt(specificity * sensitivity) 

In [16]:
def grid_search(model_build,
                X,
                y, 
                n_splits, 
                input_shape, 
                learning_rate, 
                alpha, 
                gammas, 
                batch_size, 
                n_epochs, 
                scoring, 
                random_state,
                dropout_rate,
                class_weight):
    
    #Define stratified cross validation setup
    skf = StratifiedKFold(n_splits=n_splits, 
                          shuffle=True,
                          random_state=random_state)
    
    #For each class weights and each gamma, run stratified cross validation
    results = defaultdict(list)
    results_fold = defaultdict(list)
    for weight in class_weight:
        print('Class weights:', weight)
        for gamma in gammas:
            print('Gamma:', gamma)
            results_fold = defaultdict(list)
            for i, (train_index, test_index) in enumerate(skf.split(X, y)):
                X_train = X[train_index]
                X_test = X[test_index]
                y_train = y[train_index]
                y_test = y[test_index]

                print('\tFold:', i)
                
                #Create model
                model = create_cnn_model(input_shape,
                                     learning_rate,
                                     loss = BinaryFocalCrossentropy(alpha=alpha, gamma=gamma))
                #Fit model
                hist = model.fit(X_train,
                                 y_train,
                                 steps_per_epoch=(len(train_index)//batch_size),
                                 epochs=n_epochs,
                                 batch_size=batch_size,
                                 verbose=0)
                
                #Predict
                predictions = (model.predict(X_test, verbose=0) > 0.5).astype('int32').reshape(y_test.shape[0])
                
                #Compute scores for this set of parameters
                for scoring_name, scoring_method in scoring.items():
                    results_fold[scoring_name].append(scoring_method(y_test, predictions))
                    results[scoring_name].append(results_fold[scoring_name])
                    print(scoring_name, ": Mean: ", mean_np(results_fold[scoring_name]), "std", std_np(results_fold[scoring_name]))
            
            #Print results
            print("Accuracy Mean: ", mean_np(results["accuracy"]), "std", std_np(results["accuracy"]))
            print("G-mean Mean: ", mean_np(results["g-mean"]), "std", std_np(results["g-mean"]))
        print("Best G-mean Mean: ", max(results["g-mean"]))




In [17]:
#Define parameters to fit in the grid search 
params = {'model_build': create_cnn_model(),
          'X': X_train,
          'y': y_train,
          'n_splits': n_splits,
          'input_shape': input_shape,
          'scoring': metrics,
          'random_state': random_state,
          'learning_rate': learning_rate,
          'alpha': alpha,
          'gammas': gamma,
          'dropout_rate': dropout_rate,
          'class_weight': class_weight,
          'batch_size': batch_size,
          'n_epochs': n_epochs}

### Optimise on original data

In [36]:
#Run grid search on original data
params.update({'X': X_train, 
               'y': y_train})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.9372839031886285 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 1
accuracy : Mean:  0.9371398386477141 std 0.00014406454091436816
g-mean : Mean:  0.0 std 0.0
	Fold: 2
accuracy : Mean:  0.9372178695713732 std 0.0001612886999352156
g-mean : Mean:  0.0 std 0.0
	Fold: 3
accuracy : Mean:  0.9372568850332028 std 0.00015516814212467744
g-mean : Mean:  0.0 std 0.0
	Fold: 4
accuracy : Mean:  0.9372802943103006 std 0.00014647081238696575
g-mean : Mean:  0.0 std 0.0
	Fold: 5
accuracy : Mean:  0.9372959004950324 std 0.0001381877252526533
g-mean : Mean:  0.0 std 0.0
	Fold: 6
accuracy : Mean:  0.9373070477698409 std 0.000130818345895879
g-mean : Mean:  0.0 std 0.0
	Fold: 7
accuracy : Mean:  0.9373154082259474 std 0.00012435248390848337
g-mean : Mean:  0.0 std 0.0
	Fold: 8
accuracy : Mean:  0.937321910802919 std 0.00011867450093859925
g-mean : Mean:  0.0 std 0.0
Accuracy Mean:  0.9373219108029188 std 0.00011867450093859925
G-mean Mea

accuracy : Mean:  0.937361929499713 std 3.1921639726152756e-05
g-mean : Mean:  0.00559043200139166 std 0.013693705845135705
	Fold: 7
accuracy : Mean:  0.9373634297395854 std 3.0122619991911757e-05
g-mean : Mean:  0.0048916280012177024 std 0.012942031197462
	Fold: 8
accuracy : Mean:  0.9373645965928196 std 2.859100329987342e-05
g-mean : Mean:  0.00434811377886018 std 0.012298322953610791
Accuracy Mean:  0.9373472551062912 std 5.9090727193927116e-05
G-mean Mean:  0.0010874172215245613 std 0.006433247462960955
Gamma: 2.0
	Fold: 0
accuracy : Mean:  0.9372839031886285 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 1
accuracy : Mean:  0.9372839031886285 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 2
accuracy : Mean:  0.9373139125986496 std 4.243971465061249e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 3
accuracy : Mean:  0.9373289173036601 std 4.501411503160524e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 4
accuracy : Mean:  0.9373379201266664 std 4.410464522015164e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 5
acc

### Grid search with oversampled data (sampling strategy 0.10, 0.30 and 0.50)

In [43]:
###Sampling strategy 0.10

oversample10 = RandomOverSampler(sampling_strategy=0.10)
over_X_10, over_y_10 = oversample10.fit_resample(X_train, y_train)

In [44]:
params.update({'X': over_X_10, 
               'y': over_y_10})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.9088961341406614 std 0.0
g-mean : Mean:  0.03200429965117076 std 0.0
	Fold: 1
accuracy : Mean:  0.9092221704704239 std 0.0003260363297624691
g-mean : Mean:  0.15133878901200698 std 0.11933448936083622
	Fold: 2
accuracy : Mean:  0.9094550535631113 std 0.00042348025763036464
g-mean : Mean:  0.19944849726585337 std 0.11883981497747904
	Fold: 3
accuracy : Mean:  0.9089892873777363 std 0.0008861806050508078
g-mean : Mean:  0.20597954831605897 std 0.1035381094156749
	Fold: 4
accuracy : Mean:  0.9089706567303214 std 0.0007934993736021348
g-mean : Mean:  0.16478363865284718 std 0.12395371706010207
	Fold: 5
accuracy : Mean:  0.9089892873777363 std 0.0007255594749602957
g-mean : Mean:  0.13731969887737264 std 0.12874435877920112
	Fold: 6
accuracy : Mean:  0.9090159025883292 std 0.0006748937298878835
g-mean : Mean:  0.12227534474963712 std 0.12476070142011633
	Fold: 7
accuracy : Mean:  0.9090242198416394 std 0.000631688681416

accuracy : Mean:  0.9087564042850489 std 0.00023288309268743035
g-mean : Mean:  0.05303503278995485 std 0.05303503278995485
	Fold: 2
accuracy : Mean:  0.9091755938518863 std 0.0006225721985953766
g-mean : Mean:  0.1410340622647599 std 0.1317679836393105
	Fold: 3
accuracy : Mean:  0.9089659990684676 std 0.0006499900203517954
g-mean : Mean:  0.16385084718155804 std 0.12076389425691555
	Fold: 4
accuracy : Mean:  0.9089892873777364 std 0.0005832315171867244
g-mean : Mean:  0.13108067774524643 std 0.1263434626105153
	Fold: 5
accuracy : Mean:  0.9090048129172489 std 0.0005335457313286687
g-mean : Mean:  0.10923389812103869 std 0.12525427878289183
	Fold: 6
accuracy : Mean:  0.9090025949830327 std 0.0004939972371620415
g-mean : Mean:  0.09362905553231889 std 0.12210023565773487
	Fold: 7
accuracy : Mean:  0.908977643223102 std 0.00046678393695453573
g-mean : Mean:  0.08192542359077902 std 0.1183373740987027
	Fold: 8
accuracy : Mean:  0.9090608055281683 std 0.000499004310517167
g-mean : Mean:  0

In [40]:
###Sampling strategy 0.30

oversample30 = RandomOverSampler(sampling_strategy=0.30)
over_X_30, over_y_30 = oversample30.fit_resample(X_train, y_train)

In [41]:
params.update({'X': over_X_30, 
               'y': over_y_30})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.7692125798061007 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 1
accuracy : Mean:  0.793016473555608 std 0.023803893749507365
g-mean : Mean:  0.31272898516692293 std 0.31272898516692293
	Fold: 2
accuracy : Mean:  0.8029741730380179 std 0.024001287390246248
g-mean : Mean:  0.4239303219761692 std 0.2998851225475335
	Fold: 3
accuracy : Mean:  0.8067510049657128 std 0.02179081745878443
g-mean : Mean:  0.48086906554262693 std 0.27780276564640677
	Fold: 4
accuracy : Mean:  0.8087018207614092 std 0.019876984416470365
g-mean : Mean:  0.5050027876521499 std 0.25311903810672287
	Fold: 5
accuracy : Mean:  0.8021202806021911 std 0.023362978810806092
g-mean : Mean:  0.4208356563767916 std 0.2980127001975098
	Fold: 6
accuracy : Mean:  0.804116881132366 std 0.022175926612700174
g-mean : Mean:  0.45801274127027775 std 0.2905460656378703
	Fold: 7
accuracy : Mean:  0.8059296397437764 std 0.021290911499348135
g-mean : Mean:  0.482081550

accuracy : Mean:  0.8169385985654607 std 0.0019311105856388844
g-mean : Mean:  0.6398199469049713 std 0.011190779007248408
	Fold: 2
accuracy : Mean:  0.8192375397388402 std 0.0036133622062142524
g-mean : Mean:  0.6448582767278354 std 0.011586999525357489
	Fold: 3
accuracy : Mean:  0.8185150153700638 std 0.003370224653550632
g-mean : Mean:  0.6401779960571492 std 0.012899961239076295
	Fold: 4
accuracy : Mean:  0.8179238590683375 std 0.003237992350917882
g-mean : Mean:  0.6378728009319099 std 0.012425091478031223
	Fold: 5
accuracy : Mean:  0.8177136701610572 std 0.002993001660132229
g-mean : Mean:  0.6380740938348544 std 0.01135143204013261
	Fold: 6
accuracy : Mean:  0.8172123794281552 std 0.003030856417838116
g-mean : Mean:  0.6433514360664948 std 0.016659807833844666
	Fold: 7
accuracy : Mean:  0.8165506633097415 std 0.0033321024664521924
g-mean : Mean:  0.646666340410519 std 0.017882272872827586
	Fold: 8
accuracy : Mean:  0.8112976134494635 std 0.015186359445474184
g-mean : Mean:  0.57

In [9]:
###Sampling strategy 0.5

oversample50 = RandomOverSampler(sampling_strategy=0.50)
over_X_50, over_y_50 = oversample50.fit_resample(X_train, y_train)

In [38]:
params.update({'X': over_X_50, 
               'y': over_y_50})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.7833868433636177 std 0.0
g-mean : Mean:  0.7130161297827867 std 0.0
	Fold: 1
accuracy : Mean:  0.7823963385477151 std 0.0009905048159027374
g-mean : Mean:  0.7123334885018667 std 0.0006826412809199933
	Fold: 2
accuracy : Mean:  0.7838422478766764 std 0.0021989484082260246
g-mean : Mean:  0.7109838064918849 std 0.0019884539559873763
	Fold: 3
accuracy : Mean:  0.7545426600177607 std 0.05078409275239134
g-mean : Mean:  0.5332378548689136 std 0.30786983518512845
	Fold: 4
accuracy : Mean:  0.7369720157037836 std 0.05742934291614321
g-mean : Mean:  0.4265902838951309 std 0.3483129137886889
	Fold: 5
accuracy : Mean:  0.7447281257612598 std 0.05521980990491594
g-mean : Mean:  0.473866359738734 std 0.33507716772283375
	Fold: 6
accuracy : Mean:  0.749731440959537 std 0.05257206928247495
g-mean : Mean:  0.5042396463123394 std 0.31901786437681234
	Fold: 7
accuracy : Mean:  0.7393511906455794 std 0.05632576529550319
g-mean : Me

accuracy : Mean:  0.7842407268256029 std 0.0015369902315731654
g-mean : Mean:  0.7133957927310872 std 0.0025306474956087732
	Fold: 2
accuracy : Mean:  0.7840471799075529 std 0.0012844506479195025
g-mean : Mean:  0.711027882718537 std 0.0039349011194386154
	Fold: 3
accuracy : Mean:  0.7816278434319284 std 0.0043355423013813205
g-mean : Mean:  0.7050758969065919 std 0.010857761759755367
	Fold: 4
accuracy : Mean:  0.7586401624351178 std 0.04613861129338596
g-mean : Mean:  0.5640607175252735 std 0.2821975124910333
	Fold: 5
accuracy : Mean:  0.763001246625521 std 0.04323275941514026
g-mean : Mean:  0.5883031031487859 std 0.2632514571350007
	Fold: 6
accuracy : Mean:  0.7492424168858574 std 0.05232488556222097
g-mean : Mean:  0.5042598026989593 std 0.3190310295753151
	Fold: 7
accuracy : Mean:  0.7389232945811097 std 0.056045043839259144
g-mean : Mean:  0.4412273273615894 std 0.3418623242373937
	Fold: 8
accuracy : Mean:  0.7441808496047295 std 0.05489241115233799
g-mean : Mean:  0.470752051337

### Grid search with undersampled data

In [46]:
###Sampling strategy 0.10

undersample10 = RandomUnderSampler(sampling_strategy=0.10)
under_X_10, under_y_10 = undersample10.fit_resample(X_train, y_train)

In [47]:
params.update({'X': under_X_10, 
               'y': under_y_10})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.9091289198606272 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 1
accuracy : Mean:  0.9091289198606272 std 0.0
g-mean : Mean:  0.01958001023315848 std 0.01958001023315848
	Fold: 2
accuracy : Mean:  0.909082462253194 std 6.570097850743102e-05
g-mean : Mean:  0.013053340155438986 std 0.01846021068209114
	Fold: 3
accuracy : Mean:  0.9090592334494774 std 6.968641114979413e-05
g-mean : Mean:  0.00979000511657924 std 0.01695678626827451
	Fold: 4
accuracy : Mean:  0.9090706373868469 std 6.637134825565827e-05
g-mean : Mean:  0.01890728107382956 std 0.023717608731931076
	Fold: 5
accuracy : Mean:  0.9088923836787055 std 0.0004031660768905089
g-mean : Mean:  0.07117684760511761 std 0.11886676882183914
	Fold: 6
accuracy : Mean:  0.9091832363508344 std 0.0008042972924485301
g-mean : Mean:  0.09309861570671095 std 0.1224508862957301
	Fold: 7
accuracy : Mean:  0.9091400153865731 std 0.0007609919382436236
g-mean : Mean:  0.08146128874

accuracy : Mean:  0.9088501742160279 std 0.0
g-mean : Mean:  0.0553678935453467 std 0.0
	Fold: 1
accuracy : Mean:  0.9088501742160279 std 0.0
g-mean : Mean:  0.02768394677267335 std 0.02768394677267335
	Fold: 2
accuracy : Mean:  0.9089430894308942 std 0.00013140195701491435
g-mean : Mean:  0.04761505258759882 std 0.0361307610947558
	Fold: 3
accuracy : Mean:  0.9086759581881533 std 0.0004764736700758623
g-mean : Mean:  0.0495362918153112 std 0.03146660775290279
	Fold: 4
accuracy : Mean:  0.9087918956277459 std 0.00048516768876096064
g-mean : Mean:  0.050705159731308035 std 0.02824151078820361
	Fold: 5
accuracy : Mean:  0.9091712037953547 std 0.000956833232729112
g-mean : Mean:  0.08345338359049777 std 0.07763300375908563
	Fold: 6
accuracy : Mean:  0.9092430064153739 std 0.0009031464755174664
g-mean : Mean:  0.10359914390274604 std 0.08718377084838719
	Fold: 7
accuracy : Mean:  0.9093839785365077 std 0.0009234859686363797
g-mean : Mean:  0.12191797027501944 std 0.09486801072821426
	Fold:

In [50]:
###Sampling strategy 0.30

undersample30 = RandomUnderSampler(sampling_strategy=0.30)
under_X_30, under_y_30 = undersample30.fit_resample(X_train, y_train)

In [51]:
params.update({'X': under_X_30, 
               'y': under_y_30})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.7966041740360806 std 0.0
g-mean : Mean:  0.4966119178316434 std 0.0
	Fold: 1
accuracy : Mean:  0.805978068623983 std 0.009373894587902387
g-mean : Mean:  0.562832648283014 std 0.06622073045137067
	Fold: 2
accuracy : Mean:  0.809198872831181 std 0.008906575809232821
g-mean : Mean:  0.5766034283814986 std 0.05746934426327101
	Fold: 3
accuracy : Mean:  0.8079784186007388 std 0.007997739997650612
g-mean : Mean:  0.572130683276154 std 0.0503692415455099
	Fold: 4
accuracy : Mean:  0.8090862026796003 std 0.007488645960220828
g-mean : Mean:  0.5770864382692263 std 0.046129019340339125
	Fold: 5
accuracy : Mean:  0.809942677746093 std 0.0070993612127355915
g-mean : Mean:  0.5802603770130542 std 0.04270372493704748
	Fold: 6
accuracy : Mean:  0.80913901748371 std 0.0068611963773650255
g-mean : Mean:  0.582097132675693 std 0.039791138529455004
	Fold: 7
accuracy : Mean:  0.8085805044171422 std 0.006585975378131908
g-mean : Mean:

accuracy : Mean:  0.805978068623983 std 0.0054828440042448134
g-mean : Mean:  0.5674498068661975 std 0.016365858256309707
	Fold: 2
accuracy : Mean:  0.8099065869146912 std 0.007134953690864399
g-mean : Mean:  0.5830962917958286 std 0.02584929126591461
	Fold: 3
accuracy : Mean:  0.8093938467677594 std 0.006242545995257992
g-mean : Mean:  0.5848981819856865 std 0.022602650706974325
	Fold: 4
accuracy : Mean:  0.8113508877468331 std 0.006818763962583317
g-mean : Mean:  0.5894621977978389 std 0.02218163241331658
	Fold: 5
accuracy : Mean:  0.8117119629548686 std 0.00627679548101392
g-mean : Mean:  0.5870369285837648 std 0.02096259369058589
	Fold: 6
accuracy : Mean:  0.8103016906209054 std 0.006760403912409138
g-mean : Mean:  0.5831854254663272 std 0.021579134129630424
	Fold: 7
accuracy : Mean:  0.8089343614588973 std 0.007285417056524372
g-mean : Mean:  0.5782666652827593 std 0.024016891207350115
	Fold: 8
accuracy : Mean:  0.8099940254723107 std 0.007494192837978339
g-mean : Mean:  0.5778554

In [52]:
###Sampling strategy 0.50

undersample50 = RandomUnderSampler(sampling_strategy=0.50)
under_X_50, under_y_50 = undersample50.fit_resample(X_train, y_train)

In [53]:
params.update({'X': under_X_50, 
               'y': under_y_50})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.7751660705160961 std 0.0
g-mean : Mean:  0.686517791429595 std 0.0
	Fold: 1
accuracy : Mean:  0.7687787429739397 std 0.0063873275421563735
g-mean : Mean:  0.6555109598772333 std 0.03100683155236178
	Fold: 2
accuracy : Mean:  0.768352921137796 std 0.005249884178989089
g-mean : Mean:  0.6526398644609296 std 0.025640504019813385
	Fold: 3
accuracy : Mean:  0.7663515585079204 std 0.005717282575523829
g-mean : Mean:  0.652477196834279 std 0.022207115235050905
	Fold: 4
accuracy : Mean:  0.7660705160960655 std 0.005144491752132133
g-mean : Mean:  0.6543863653397859 std 0.020226331134932572
	Fold: 5
accuracy : Mean:  0.7644353602452734 std 0.0059517646771099315
g-mean : Mean:  0.649872676825996 std 0.021042512032728982
	Fold: 6
accuracy : Mean:  0.7651485091906609 std 0.005780526869203264
g-mean : Mean:  0.6564091252274764 std 0.025216719225557682
	Fold: 7
accuracy : Mean:  0.765236029386409 std 0.005412143569560019
g-mean 

accuracy : Mean:  0.772100153295861 std 0.005620848237097631
g-mean : Mean:  0.6662283996451525 std 0.017397816997083537
	Fold: 2
accuracy : Mean:  0.7669902912621359 std 0.008560607646485505
g-mean : Mean:  0.6507871525584888 std 0.026050980417100036
	Fold: 3
accuracy : Mean:  0.7664793050587634 std 0.007466346037916453
g-mean : Mean:  0.6522999159499727 std 0.022712453516847052
	Fold: 4
accuracy : Mean:  0.7671946857434848 std 0.0068296512811869894
g-mean : Mean:  0.6555701253481214 std 0.021341544317010182
	Fold: 5
accuracy : Mean:  0.7645205246125021 std 0.008638622727291562
g-mean : Mean:  0.6497021752741543 std 0.023488623882252972
	Fold: 6
accuracy : Mean:  0.7655866839652589 std 0.00841339100903724
g-mean : Mean:  0.6577835940472365 std 0.02940671581580485
	Fold: 7
accuracy : Mean:  0.7667697390626487 std 0.008469612758638848
g-mean : Mean:  0.6629304410994799 std 0.03069349784957505
	Fold: 8
accuracy : Mean:  0.7675194772126974 std 0.008262004380108344
g-mean : Mean:  0.666533

The best geometric mean (0.8092443785671555) is obtained using the parameters: Gamma: 1.0, Class weights: 0: 7.95162932790224, 1: 0.5335497096002734. This gives a standard deviation of 0.18674246316704318 in the grid search, and an accuracy of 0.8092443785671555 with a standard deviation of 0.013432320440809655. This is obtained. with an oversampling strategy of 0.3.

### Optimise for all tree types

In [27]:
#Instantiate variables

# Set random state
random_state = 1
set_seed(random_state)

#Define inverse class weights
weight_major_class, weight_minor_class = len(y_train_all) / (len(unique(y_train_all)) * bincount(y_train_all.astype('int64')))

#Define dropout rate
dropout_rate = 0.3

#Define number of splits
n_splits = 9

#Define input shape
input_shape = X_train_all.shape[1]

#Define alpha as inverse class weights
alpha = bincount(y_train_all.astype('int64'))[0]/len(y_train_all), # inverse class weight

#Define gamma values
gamma = [0.0, 1., 2.]

#Define learning rate
learning_rate = 0.001

#Define class weights
class_weight = [{0: 1., 1: 1.}, 
                {0: weight_minor_class, 1: weight_major_class}, 
                {0: weight_major_class, 1: weight_minor_class}]

#Define batch size
batch_size = 32

#Define number of epochs in each split
n_epochs = 10

#Define metrics to optimize for
metrics = {'accuracy': accuracy_score,
           'g-mean': geometric_mean_score}

#Define refitting metric
refitting_metric = 'g-mean'


In [28]:
#Define parameters to fit in the grid search 
params = {'model_build': create_model(),
          'X': X_train_all,
          'y': y_train_all,
          'n_splits': n_splits,
          'input_shape': input_shape,
          'scoring': metrics,
          'random_state': random_state,
          'learning_rate': learning_rate,
          'alpha': alpha,
          'gammas': gamma,
          'dropout_rate': dropout_rate,
          'class_weight': class_weight,
          'batch_size': batch_size,
          'n_epochs': n_epochs}

#Run grid search for data with all tree types
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.8629466000768344 std 0.0
g-mean : Mean:  0.266446666801272 std 0.0
	Fold: 1
accuracy : Mean:  0.8628985785631963 std 4.802151363814122e-05
g-mean : Mean:  0.2638816222667824 std 0.00256504453448958
	Fold: 2
accuracy : Mean:  0.8628461628035371 std 8.385821905280307e-05
g-mean : Mean:  0.24996614919859952 std 0.01979058068202956
	Fold: 3
accuracy : Mean:  0.8630360724916645 std 0.00033685489399876926
g-mean : Mean:  0.23336985455641013 std 0.03346731689798324
	Fold: 4
accuracy : Mean:  0.8631500183045409 std 0.00037777184751779135
g-mean : Mean:  0.22303752337128283 std 0.03637412969620918
	Fold: 5
accuracy : Mean:  0.8627617296264027 std 0.000934219885875542
g-mean : Mean:  0.21578522644518983 std 0.03695326074225334
	Fold: 6
accuracy : Mean:  0.8628685895802911 std 0.0009036592958065104
g-mean : Mean:  0.2184089174378136 std 0.03481046345723063
	Fold: 7
accuracy : Mean:  0.8631648521136643 std 0.001152790322782247

accuracy : Mean:  0.8611217825585863 std 0.0
g-mean : Mean:  0.22489420968312293 std 0.0
	Fold: 1
accuracy : Mean:  0.8616500192086054 std 0.0005282366500191649
g-mean : Mean:  0.2517870039190025 std 0.02689279423587955
	Fold: 2
accuracy : Mean:  0.8620458073179517 std 0.0007066251438695139
g-mean : Mean:  0.23263927439071433 std 0.03486286486677758
	Fold: 3
accuracy : Mean:  0.862699949571645 std 0.001287709433320461
g-mean : Mean:  0.2296912062815683 std 0.030620872403768755
	Fold: 4
accuracy : Mean:  0.8628619095180403 std 0.0011964449693468498
g-mean : Mean:  0.22263696263682958 std 0.03080843512760192
	Fold: 5
accuracy : Mean:  0.8624415954272978 std 0.0014409094853411126
g-mean : Mean:  0.21313660730108067 std 0.03524556927289602
	Fold: 6
accuracy : Mean:  0.8625255800864687 std 0.0013497917635520492
g-mean : Mean:  0.21262570861060448 std 0.03265504479521862
	Fold: 7
accuracy : Mean:  0.8628647188065698 std 0.0015489677464802629
g-mean : Mean:  0.21227628070189447 std 0.03055998

In [9]:
###Sampling strategy 0.5

oversample50 = RandomOverSampler(sampling_strategy=0.50)
over_X_50, over_y_50 = oversample50.fit_resample(X_train_all, y_train_all)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.8605730750760365 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 1
accuracy : Mean:  0.8610799850594952 std 0.0005069099834587032
g-mean : Mean:  0.07814182681945958 std 0.07814182681945958
	Fold: 2
accuracy : Mean:  0.8608729605128046 std 0.0005069748318061798
g-mean : Mean:  0.11309529748444118 std 0.08071092628744306
	Fold: 3
accuracy : Mean:  0.8612230234795875 std 0.000748598592172077
g-mean : Mean:  0.12598131653657368 std 0.0733746460139234
	Fold: 4
accuracy : Mean:  0.8610915457847372 std 0.0007193506924540782
g-mean : Mean:  0.10078505322925894 std 0.08274344494122174
Accuracy Mean:  0.8610915457847373 std 0.0007193506924540782
G-mean Mean:  0.10078505322925896 std 0.08274344494122174
Gamma: 0.5
	Fold: 0
accuracy : Mean:  0.8617469718798356 std 0.0
g-mean : Mean:  0.1449328635602361 std 0.0
	Fold: 1
accuracy : Mean:  0.861720292407022 std 2.6679472813595506e-05
g-mean : Mean:  0.16906576469177825 std 0.02413290

### Optimise for only street features

In [19]:
#Define parameters to fit in the grid search 
params = {'model_build': create_model(),
          'X': X_train_street,
          'y': y_train_street,
          'n_splits': n_splits,
          'input_shape': input_shape,
          'scoring': metrics,
          'random_state': random_state,
          'learning_rate': learning_rate,
          'alpha': alpha,
          'gammas': gamma,
          'dropout_rate': dropout_rate,
          'class_weight': class_weight,
          'batch_size': batch_size,
          'n_epochs': n_epochs}

#Optimise for only street features
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.9372839031886285 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 1
accuracy : Mean:  0.9372839031886285 std 0.0
g-mean : Mean:  0.0 std 0.0
	Fold: 2
accuracy : Mean:  0.9373139125986496 std 4.243971465061249e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 3
accuracy : Mean:  0.9373289173036601 std 4.501411503160524e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 4
accuracy : Mean:  0.9373379201266664 std 4.410464522015164e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 5
accuracy : Mean:  0.9373439220086706 std 4.243971465061249e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 6
accuracy : Mean:  0.937348209067245 std 4.06706086733414e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 7
accuracy : Mean:  0.9373514243611759 std 3.89833671462451e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 8
accuracy : Mean:  0.9373539251453443 std 3.74283102260217e-05
g-mean : Mean:  0.0 std 0.0
Accuracy Mean:  0.9373539251453442 std 3.74283102260217e-05
G-mean Mean:  0.0 std 0.0
Gamma: 1.0


accuracy : Mean:  0.9373289173036601 std 4.501411503160524e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 4
accuracy : Mean:  0.9373379201266664 std 4.410464522015164e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 5
accuracy : Mean:  0.9373439220086706 std 4.243971465061249e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 6
accuracy : Mean:  0.937348209067245 std 4.06706086733414e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 7
accuracy : Mean:  0.9373514243611759 std 3.89833671462451e-05
g-mean : Mean:  0.0 std 0.0
	Fold: 8
accuracy : Mean:  0.9373539251453443 std 3.74283102260217e-05
g-mean : Mean:  0.0 std 0.0
Accuracy Mean:  0.9373539251453444 std 3.7428310226021705e-05
G-mean Mean:  0.0 std 0.0
Best G-mean Mean:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


Because the model predicts all in the negative class, I choose to oversample the data with a strategy of .5, which was the one that worked best for the CNN classifier

In [23]:
###Sampling strategy 0.5

oversample50 = RandomOverSampler(sampling_strategy=0.50)
over_X_50, over_y_50 = oversample50.fit_resample(X_train_street, y_train_street)

In [24]:
params.update({'X': over_X_50, 
               'y': over_y_50})
grid_search_optimise = grid_search(**params)

Class weights: {0: 1.0, 1: 1.0}
Gamma: 0.0
	Fold: 0
accuracy : Mean:  0.7435617186966322 std 0.0
g-mean : Mean:  0.6797631986055201 std 0.0
	Fold: 1
accuracy : Mean:  0.7426053692192089 std 0.0009563494774232906
g-mean : Mean:  0.6746510474874812 std 0.005112151118039032
	Fold: 2
accuracy : Mean:  0.7428103012500854 std 0.0008329048504301898
g-mean : Mean:  0.6675759159748543 std 0.01084148042832291
	Fold: 3
accuracy : Mean:  0.7425199808730105 std 0.0008792926881518964
g-mean : Mean:  0.6628069745821039 std 0.012505266067970873
	Fold: 4
accuracy : Mean:  0.7430117491471036 std 0.0012593127676506337
g-mean : Mean:  0.6672886849582126 std 0.014333466263464475
	Fold: 5
accuracy : Mean:  0.7426450553362994 std 0.0014120477895772762
g-mean : Mean:  0.6680597604306349 std 0.013197714430628425
	Fold: 6
accuracy : Mean:  0.741182733004865 std 0.003813051103077901
g-mean : Mean:  0.6637498981254433 std 0.016147641786794276
	Fold: 7
accuracy : Mean:  0.7406581322591586 std 0.003827320273000274


accuracy : Mean:  0.7424004371883325 std 0.0
g-mean : Mean:  0.6677330715503169 std 0.0
	Fold: 1
accuracy : Mean:  0.7418539517726621 std 0.000546485415670428
g-mean : Mean:  0.6646134006253723 std 0.00311967092494464
	Fold: 2
accuracy : Mean:  0.7423093562857209 std 0.0007835075498820348
g-mean : Mean:  0.6646545533235115 std 0.0025478654220919073
	Fold: 3
accuracy : Mean:  0.7408805246259991 std 0.0025661435649649627
g-mean : Mean:  0.6627861107084919 std 0.003916879762474539
	Fold: 4
accuracy : Mean:  0.7419871085927243 std 0.003188445787286192
g-mean : Mean:  0.6663803284799433 std 0.007996697018983326
	Fold: 5
accuracy : Mean:  0.741597628067376 std 0.0030381404696811195
g-mean : Mean:  0.6668933949139638 std 0.0073895526594266805
	Fold: 6
accuracy : Mean:  0.7406753115950803 std 0.0036077263800654157
g-mean : Mean:  0.6634394270702428 std 0.010880444275250954
	Fold: 7
accuracy : Mean:  0.7403507692128493 std 0.0034822439222851496
g-mean : Mean:  0.6672952274352493 std 0.014410289