# Combined model

In this notebook we will combine the optimized version of the three models we've created. So, the predictions of our k-Nearest Neighbor, Decision tree and Neural Network will be combined to one new prediction. 

In [None]:
# Instal nbimporter to be able to import functions from other notebooks
pip install nbimporter

In [48]:
from data_processing import prepare_data, split_data, one_hot_encode
import numpy as np
import pandas as pd
import nbimporter
from helper_functions import get_metrics

# Import functions for k-nearest neighbors
from kNN_die_wel_opent import split_datatypes, train_and_predict
from oversampling import smote_loop

# Import functions for decision tree
from Decision_tree import resampled_forest

# Import functions for deep neural networks
from deep_neural_network import train_and_predict as train_dNN
from deep_neural_network import get_model

In [49]:
# Load the data normalized
data = prepare_data('healthcare-dataset-stroke-data.csv', one_hot = False, binary = False, normalize = True)

# Split the normalized data into training, testing and validation data
train_data, test_data, val_data, train_labels, test_labels, val_labels = split_data(data, (0.6, 0.2, 0.2))

# Change the data to one-hot encoded data
train_hot = one_hot_encode(train_data)
test_hot = one_hot_encode(test_data)
val_hot = one_hot_encode(val_data)



### k-Nearest Neighbor
The k-Nearest Neighbor model with the best balanced accuracy was trained on only numeric data that was overfitted with a ratio of 0.6. 

In [50]:
# Split test data into numeric and binary data
test_num, test_bin = split_datatypes(test_hot)
val_num, val_bin = split_datatypes(val_hot)

# Get the oversampled data with a oversampling ratio of 0.6
data_list, labels_list, ratio_list = smote_loop(train_data, train_labels, 0.6, 0.7, 0.1)
train_num, train_bin = split_datatypes(data_list[0])

# Predictions using model trained on numerical, oversampled data and euclidean distance metric and 5 neighbors
predict_train_kNN, predict_val_kNN = train_and_predict(train_num, labels_list[0], val_num, 5, "distance", 
                                                          metric='euclidean')
predict_train_kNN, predict_test_kNN = train_and_predict(train_num, labels_list[0], test_num, 5, "distance", 
                                                          metric='euclidean')


### Resampled Forest
The optimal number of splits was around 17 most of the time.

In [51]:
# Create a tuple of the data that gets accepted by the forest function
data_DT_val = (train_hot, train_labels, val_hot, val_labels)
data_DT_test = (train_hot, train_labels, test_hot, test_labels)

# Train the forest on the training data and return a list with predicted labels fror training and testing data
predict_train_DT, predict_val_DT = resampled_forest(data_DT_val, 17)
predict_train_DT, predict_test_DT = resampled_forest(data_DT_test, 17)


### Deep Neural Network

In [52]:
# Create a Deep Neural Network with two hidden layers 
model = get_model(train_hot, hidden_layers=2, nodes=[25, 15], dropout_rate=[0.3, 0.3])

# Get the predictions for the training and testing data using the Deep Neural Network
predictions_train, predictions_val, history = train_dNN(model, train_hot, train_labels, val_hot, val_labels, 
                                                         class_weight = 15, plot=False, batch_size=None, epochs=50, verbose=0)
predictions_train, predictions_test, history = train_dNN(model, train_hot, train_labels, test_hot, test_labels, 
                                                         class_weight = 15, plot=False, batch_size=None, epochs=50, verbose=0)



# Combining the models
The models can be combined in different ways. Considering we started with too few stroke predictions an OR function might be good.

In [54]:
predictions_test = predictions_test.reshape(1022, )
predictions_val = predictions_val.reshape(1022, )

predict_combined_val = (predict_val_kNN + predict_val_DT + predictions_val) >= 1
predict_combined_test = (predict_test_kNN + predict_test_DT + predictions_test) >= 1

print('The accuracy using only k-Nearest Neighbors: ')
test_acc, test_balacc = get_metrics(test_labels, predict_test_kNN, verbose = True)
print('The accuracy using only Resampled Forest: ')
test_acc, test_balacc = get_metrics(test_labels, predict_test_DT, verbose = True)
print('The accuracy using the deep Neural Netwerk: ')
test_acc, test_balacc = get_metrics(test_labels, predictions_test, verbose = True)

print('The accuracy using the combined predictions on validation data: ')
test_acc, test_balacc = get_metrics(val_labels, predict_combined_val, verbose = True)

print('The accuracy using the combined predictions on test data: ')
test_acc, test_balacc = get_metrics(test_labels, predict_combined_test, verbose = True)

The accuracy using only k-Nearest Neighbors: 
accuracy: 82.9746 % 

balanced accuracy: 67.3354 %
sensitivity: 0.5000
specificity: 0.8467 

confusion matrix: 
[[823 149]
 [ 25  25]] 

[["True Negative", "False Positive"] 
 ["False Negative", "True Positive"]] 

The accuracy using only Resampled Forest: 
accuracy: 74.8532 % 

balanced accuracy: 77.2942 %
sensitivity: 0.8000
specificity: 0.7459 

confusion matrix: 
[[725 247]
 [ 10  40]] 

[["True Negative", "False Positive"] 
 ["False Negative", "True Positive"]] 

The accuracy using the deep Neural Netwerk: 
accuracy: 74.9511 % 

balanced accuracy: 72.6029 %
sensitivity: 0.7000
specificity: 0.7521 

confusion matrix: 
[[731 241]
 [ 15  35]] 

[["True Negative", "False Positive"] 
 ["False Negative", "True Positive"]] 

The accuracy using the combined predictions on validation data: 
accuracy: 68.8845 % 

balanced accuracy: 75.1049 %
sensitivity: 0.8200
specificity: 0.6821 

confusion matrix: 
[[663 309]
 [  9  41]] 

[["True Negative", 

In [16]:
combined_test = pd.DataFrame({'knn': predict_test_kNN, 'DT': predict_test_DT, 'NN': predictions_test})

model = get_model(combined, hidden_layers=2, nodes=[25, 15], dropout_rate=[0.3, 0.3])

predictions_train, predictions_test, history = train_dNN(model, combined, test_labels, val_hot, val_labels,
                                                        class_weight = 10, plot=True, batch_size=None,
                                                        epochs=50, verbose=0)

      knn     DT     NN
0       0  False  False
1       0  False  False
2       0  False  False
3       0  False  False
4       0  False  False
...   ...    ...    ...
1017    0  False  False
1018    0  False  False
1019    1   True   True
1020    1  False  False
1021    0  False  False

[1022 rows x 3 columns]
