In [None]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedShuffleSplit
import statistics
from oversampling import one_hot, smote_loop

from tqdm import tqdm
from IPython.display import display

from data_processing import prepare_data, split_data
from helper_functions import get_metrics

import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing, Input, metrics, initializers
from tensorflow.keras.metrics import FalsePositives, TruePositives, FalseNegatives, TrueNegatives

from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix

In [None]:
# Load the stroke data set and clean it using prepare_data's default options
data = prepare_data('healthcare-dataset-stroke-data.csv')

# Partition the cleaned data into training, test and validation parts (60/20/20)
(train_data, test_data, val_data,
 train_labels, test_labels, val_labels) = split_data(data, split_size=(0.6, 0.2, 0.2))

In [None]:
# Actual model

def train_and_predict(model, training_data, training_labels, 
                      testing_data, testing_labels, epochs=5, 
                      class_weight=10, verbose=0, plot=True):
    """
    Train a binary classification network and predict classes for the training and testing data.

    The model is compiled with binary cross-entropy loss and accuracy as its only metric,
    fitted with a class-weighted loss, and then used to predict on both data sets.
    Compiling does not re-initialise the weights: pass a freshly built model when
    successive runs have to be independent of each other.

    input:

    model:           model architecture defined before calling this function
    training_data:   features the model is fitted on
    training_labels: labels (0 or 1) belonging to training_data
    testing_data:    features used as validation data while fitting and for the second prediction
    testing_labels:  labels (0 or 1) belonging to testing_data
    epochs:          number of epochs to train for
    class_weight:    errors on the stroke class (1) should be weighted heavier than errors on the
                     non-stroke class (0). A number w gives a ratio of 1 to w, i.e. {0: 1, 1: w}.
                     A ready-made {class: weight} dict, or a list holding exactly one such dict,
                     is also accepted and used as given.
    verbose:         0: no text per epoch
                     1: text for each epoch
    plot:            True: show accuracy and loss over epochs in figure
                     False: no plot

    output:

    predictions_train: boolean array of training predictions (model output >= 0.5)
    predictions_test:  boolean array of test predictions (model output >= 0.5)
    history:           Keras History object; history.history holds 'loss', 'accuracy',
                       'val_loss' and 'val_accuracy' per epoch.

    """

    # Keras expects ONE dict mapping class index -> weight. Normalise every supported
    # way of specifying the weighting to that form.
    if isinstance(class_weight, (list, tuple)) and len(class_weight) == 1:
        class_weight = class_weight[0]
    if isinstance(class_weight, dict):
        class_weight_map = dict(class_weight)
    else:
        class_weight_map = {0: 1., 1: float(class_weight)}

    # Binary cross entropy because there are only 2 output classes; accuracy is the only
    # metric that is tracked. No optimizer is passed, so Keras' default is used.
    model.compile(loss='binary_crossentropy', metrics=['accuracy'])

    # Silence TensorFlow warnings while fitting and always restore the previous log level,
    # also when fitting fails.
    tf_logger = tf.get_logger()
    previous_level = tf_logger.level
    tf_logger.setLevel('ERROR')
    try:
        history = model.fit(training_data, training_labels, epochs=epochs,
                            validation_data=(testing_data, testing_labels),
                            class_weight=class_weight_map,
                            verbose=verbose
                           )
    finally:
        tf_logger.setLevel(previous_level)

    # Threshold the sigmoid output at 0.5 to get the predicted classes
    predictions_train = model.predict(training_data) >= 0.5
    predictions_test = model.predict(testing_data) >= 0.5

    # Plot the accuracy and the loss over epochs side by side
    if plot:
        fig, axs = plt.subplots(1, 2)
        fig.suptitle('loss and accuracy')

        axs[0].plot(history.history['accuracy'])
        axs[0].plot(history.history['val_accuracy'])
        axs[0].legend(['train', 'test'], loc='upper left')
        axs[0].set_title('accuracy')
        axs[0].set_ylabel('accuracy')
        axs[0].set_xlabel('epochs')

        axs[1].plot(history.history['loss'])
        axs[1].plot(history.history['val_loss'])
        axs[1].set_title('loss')
        axs[1].set_xlabel('epochs')
        axs[1].set_ylabel('loss')
        axs[1].legend(['train', 'test'], loc='upper left')

        plt.show()

    return predictions_train, predictions_test, history



In [None]:
def get_model(n_inputs=None):
    """
    Build an untrained, uncompiled feed-forward network for binary classification.

    Architecture: input layer -> Dense(25, relu) -> Dense(10, relu) -> Dense(1, sigmoid).
    The kernels of both hidden layers are drawn from a seeded normal distribution
    (mean 0, stddev 0.05); the output layer uses the Keras default initializer.

    input:

    n_inputs: number of input features. When None (default) it is taken from the
              second dimension of the global train_data.

    output:

    model:    the Sequential model, still to be compiled and fitted
    """
    # Get the amount of input features for the nodes in the first layer
    if n_inputs is None:
        n_inputs = np.shape(train_data)[1]

    # Seeded initializer so the hidden-layer weights start from the same values on every call
    weight_init = initializers.RandomNormal(mean=0.0, stddev=0.05, seed=12345)
    model = models.Sequential()

    # Input layer; shape has to be a tuple, hence the trailing comma
    model.add(Input(shape=(n_inputs,)))

    # Two hidden layers with 25 and 10 nodes
    model.add(layers.Dense(25, activation='relu', kernel_initializer=weight_init))
    model.add(layers.Dense(10, activation='relu', kernel_initializer=weight_init))

    # Output layer with 1 node (only 1 output class, 0 or 1 for stroke) and sigmoid activation function
    model.add(layers.Dense(1, activation='sigmoid'))

    return model

# Build the network for the baseline run
model = get_model()

# Fit it with the stroke class weighted 10 times heavier, then predict on train and test data
predictions_train, predictions_test, history = train_and_predict(
    model,
    train_data,
    train_labels,
    test_data,
    test_labels,
    class_weight=10,
    epochs=30,
    verbose=0,
)

# Report the metrics on the training data ...
print('train metrics: \n')
accuracy_train, balanced_accuracy_train = get_metrics(
    train_labels, predictions_train, verbose=True
)

# ... and on the test data
print('test metrics: \n')
accuracy_test, balanced_accuracy_test = get_metrics(
    test_labels, predictions_test, verbose=True
)


In [None]:
# Test different class weights and plot accuracy, balanced accuracy, sensitivity and specificity

# Relative weights of the stroke class that are tried: 1, 3, ..., 29
class_weights = range(1, 30, 2)

accuracies_val = []
sensitivities_val = []
balanced_accuracies_val = []
specificities_val = []

for weight in tqdm(class_weights):

    # Start from a freshly initialised network for every weight. Reusing one model would
    # keep training the same weights, so later results would depend on all earlier runs.
    weighted_model = get_model()

    # Train and predict
    predictions_train, predictions_val, history = train_and_predict(weighted_model, train_data,
                                                          train_labels, val_data, val_labels,
                                                          epochs=30, verbose=0, plot=False,
                                                          class_weight=weight)

    # metrics; the confusion matrix has to use the validation labels because the
    # predictions were made on the validation data
    accuracy_val, balanced_accuracy_val = get_metrics(val_labels, predictions_val, verbose=False)
    conmat = confusion_matrix(val_labels, predictions_val)
    sensitivity = conmat[1, 1] / conmat[1, :].sum()   # TP / (TP + FN)
    specificity = conmat[0, 0] / conmat[0, :].sum()   # TN / (TN + FP)

    accuracies_val.append(accuracy_val)
    balanced_accuracies_val.append(balanced_accuracy_val)
    sensitivities_val.append(sensitivity)
    specificities_val.append(specificity)

## Plot metrics over class weights.

plt.plot(class_weights, accuracies_val)
plt.plot(class_weights, balanced_accuracies_val)
plt.plot(class_weights, sensitivities_val)
plt.plot(class_weights, specificities_val)
plt.ylim(0, 1)
plt.title('Effect of class weights on different output metrics in validation data')
plt.ylabel('proportions')
plt.xlabel('Relative weight of stroke class')
plt.legend(['accuracy', 'balanced_accuracy', 'sensitivity', 'specificity'])
plt.show()


In [None]:
# Reload and clean the data, this time without one-hot encoding but with the
# binary and normalize options switched on
data = prepare_data(
    'healthcare-dataset-stroke-data.csv',
    one_hot=False,
    binary=True,
    normalize=True,
)

# Partition the cleaned data into training, test and validation parts (60/20/20)
(train_data, test_data, val_data,
 train_labels, test_labels, val_labels) = split_data(data, split_size=(0.6, 0.2, 0.2))

In [None]:
# Try out different oversampling ratios

oversampling_val = []
oversampling_val_bal = []

# Boolean mask with one entry per feature column, handed to smote_loop
# (True presumably marks the categorical features - confirm against smote_loop)
n_features = np.array([True, False, True, True, True, True, True, False, False, True])

# Quick look at the first rows only instead of dumping the whole training set
print(train_data[:5])

# NOTE(review): 0.2, 1.01, 0.2 look like start, stop and step of the oversampling ratio,
# which would give the ratios 0.2, 0.4, ..., 1.0 - confirm against smote_loop
list_data, list_labels, list_ratio = smote_loop(train_data, train_labels, n_features, 0.2, 1.01, 0.2)

for data_res, labels_res in zip(list_data, list_labels):

    # Build a fresh network per ratio: the data was re-prepared without one-hot encoding,
    # so a previously built model may expect a different number of inputs, and reusing a
    # trained model would carry over what it learned in earlier runs.
    resampled_model = get_model()

    # Train and predict; class_weight is the plain ratio, train_and_predict builds the mapping
    predictions_train, predictions_val, history = train_and_predict(resampled_model, data_res,
                                                          labels_res, val_data, val_labels,
                                                          epochs=30, verbose=0, plot=False,
                                                          class_weight=10)

    # metrics
    accuracy_val, balanced_accuracy_val = get_metrics(val_labels, predictions_val, verbose=False)

    oversampling_val.append(accuracy_val)
    oversampling_val_bal.append(balanced_accuracy_val)

## Plot metrics over oversampling ratios.

# Plot the results collected in THIS cell (not the class-weight results of the cell above)
plt.plot(np.linspace(0.2, 1, 5), oversampling_val)
plt.plot(np.linspace(0.2, 1, 5), oversampling_val_bal)

plt.ylim(0, 1)
plt.title('Effect of oversampling ratio on different output metrics in validation data')
plt.ylabel('proportions')
plt.xlabel('Oversampling ratio')
plt.legend(['accuracy', 'balanced_accuracy'])
plt.show()

