In [6]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns


%matplotlib inline
%config InlineBackend.figure_format ='retina'

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.metrics import *
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.preprocessing import MinMaxScaler
from skopt import BayesSearchCV


sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings
warnings.filterwarnings('ignore')

filterwarnings("ignore")

seed = 0


#import keras
#from keras.models import Sequential
#from keras.layers import Dense
#from keras.layers import Dropout
#from keras.wrappers.scikit_learn import KerasClassifier
#from keras.constraints import maxnorm
#from keras.layers import LeakyReLU
#from keras.optimizers import *
#from keras.utils import np_utils
#from keras import regularizers
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt


In [7]:
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

data['BPlabel_encoded'] = data['BPlabel'].map( {'most likely':0,'probable':1, 'least likely':2})
Y = data["BPlabel_encoded"] 
data = data.drop(["BPlabel"],1)

X = pd.read_csv("selected_features_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

X = MinMaxScaler().fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [11]:
def model_builder(hp):
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=10))
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(keras.layers.Dense(10))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
    return model

tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3)

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(X_train, Y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

Trial 30 Complete [00h 00m 00s]
val_accuracy: 0.3191489279270172

Best val_accuracy So Far: 0.7872340679168701
Total elapsed time: 00h 00m 16s
INFO:tensorflow:Oracle triggered exit


In [12]:
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")



The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 160 and the optimal learning rate for the optimizer
is 0.01.



In [13]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, Y_train, epochs=50, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(X_train, Y_train, epochs=best_epoch, validation_split=0.2)

eval_result = hypermodel.evaluate(X_test, Y_test)
print("[test loss, test accuracy]:", eval_result)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Best epoch: 17
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
[test loss, test accuracy]: [0.6046973466873169, 0.7457627058029175]


In [14]:

inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)


scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted']

nested_cv_results = cross_validate(hypermodel, X , Y, cv=outer_cv, scoring=scoring, error_score="raise")
nested_cv_results2 = cross_val_score(hypermodel, X , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")

print( 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
print( 'Accuracy Nested CV Average', np.median(nested_cv_results['test_accuracy']))
print( 'Balanced Accuracy Nested CV Average', np.median(nested_cv_results['test_balanced_accuracy'] ))
print( 'F1 Nested CV Average', np.median(nested_cv_results['test_f1_weighted'] ))
print( 'Precision Nested CV Average', np.median(nested_cv_results['test_precision_weighted'] ))
print( 'Recall Nested CV Average', np.median(nested_cv_results['test_recall_weighted'] ))
hypermodel.fit(X_train, Y_train)
print("Best Parameters: \n{}\n".format(hypermodel.best_params_))
print('Non-nested CV Results:')
y_pred_train = hypermodel.predict(X_train)
y_pred = hypermodel.predict(X_test)
print( 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
print( 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
print( 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
print( 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
print( 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))


TypeError: Cannot clone object '<keras.engine.sequential.Sequential object at 0x7fdc04eb5a60>' (type <class 'keras.engine.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.