In [None]:
###################################################################################################
#
# Perform standard neural network model analysis using sklearn and keras (tensorflow) frameworks
# and perform model analysis 
# 
# Note: This notebook defines the model and uses an oversampled data set to help the model learn from 
#       the imbalanced data
#
###################################################################################################

In [None]:
#######################################################################################
#
# Set parameters for SNN
#

# Relative path of the cleaned ICU dataset (CSV)
FILENAME = "../visualization/data/icu_cleaned.csv"

# Training hyper-parameters
LEARNING_RATE = 1e-3  # ADAM default value: 1e-3
EPOCHS = 300
BATCH_SIZE = 1024

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import to_categorical 
from keras.callbacks import TensorBoard
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup

import time
from time import time
import os
import tempfile

In [None]:
# Color for graph

# Default figure size for every plot, plus the matplotlib colour cycle
# that the plot helpers index into (colors[0], colors[1], ...).
mpl.rcParams['figure.figsize'] = 12, 10
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
#######################################################################################
#
# Data preprocessing
#

In [None]:
# Read dataset

# Load the CSV named by FILENAME into a pandas dataframe
file = FILENAME
df = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Specify label and predictors

# Target column
label = ['icu']

# Feature columns.
# 'age' must stay first: the preprocessing cells scale column 0 separately
# and one-hot encode every remaining column.
# 'intubed' and 'pregnancy' are deliberately left out.
predictor = [
    'age',
    'sex', 'pneumonia', 'diabetes', 'copd',
    'asthma', 'inmsupr', 'hypertension', 'other_disease',
    'cardiovascular', 'obesity', 'renal_chronic', 'tobacco',
]

label, predictor

In [None]:
# Review the dataset

# Show the shape of the predictor columns and a short preview.
print("Dataframe shape: {}".format(df[predictor].shape))
# .head() returns only the first 5 rows; the previous `[0:]` slice selected the
# whole frame. Use df[predictor].tail() to inspect the end of the data instead.
df[predictor].head()

In [None]:
# Review imbalanced class column (label)

# bincount yields [count of icu == 0, count of icu == 1]; both counts are reused later
neg, pos = np.bincount(df['icu'])
total = pos + neg
print(
    'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total
    )
)

In [None]:
# oversampling

# Split the data 70 / 15 / 15 into train / validation / test, then oversample
# the positive ('icu' == 1) rows of the TRAINING split only, so validation and
# test keep the original class distribution.

under_df = df.copy()
train_df = under_df.sample(frac=0.70, random_state=1337)
interm_df = under_df.drop(train_df.index)
val_df = interm_df.sample(frac=0.50, random_state=1337) # half of 30%
test_df = interm_df.drop(val_df.index)
print(
    "Using %d samples for training, %d for validation and %d for testing"
    % (len(train_df), len(val_df), len(test_df))
)

df_train_neg = train_df[train_df['icu'] == 0]
df_train_pos = train_df[train_df['icu'] == 1]
print("neg: {}, pos: {}".format(df_train_neg.shape, df_train_pos.shape))

# Number of EXTRA copies of the positive rows needed so that
# (1 + repeat) * n_pos ~= n_neg. The previous formula, int(pos / neg * 100),
# only balanced the classes by coincidence when positives were ~10% of negatives.
n_neg, n_pos = len(df_train_neg), len(df_train_pos)
repeat = max(round(n_neg / n_pos) - 1, 0) if n_pos else 0
print("neg / pos ~= {:.2f}, extra copies of the positive rows: {}".format(
    n_neg / n_pos if n_pos else float('nan'), repeat))

# Build the oversampled positives in one concat: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop copies the growing frame every time.
df_train_pos_oversampled = pd.concat([df_train_pos] * (repeat + 1))
print("df_train_neg: {}, f_train_pos_oversampled: {}".format(df_train_neg.shape, df_train_pos_oversampled.shape))

df_train_balanced = pd.concat([df_train_neg, df_train_pos_oversampled])
print("df_train_balanced: {}".format(df_train_balanced.shape))

# A single seeded shuffle is sufficient and keeps the run reproducible.
train_df = df_train_balanced.sample(frac=1, random_state=1337)
print("oversampled train_df: {}".format(train_df.shape))

print("label mean after oversampling: {}".format(train_df[label].values.mean()))

In [None]:
# Get numpy n-dimensional arrays (tensors) from the pandas dataframes:
# one feature matrix and one label column per split.

x_train, y_train = train_df[predictor].values, train_df[label].values
x_test, y_test = test_df[predictor].values, test_df[label].values
x_val, y_val = val_df[predictor].values, val_df[label].values

print(
    "train shape: [features={}, label={}] \n"
    "test shape: [features={}, label={}] \n"
    "validation shape: [features={}, label={}]".format(
        x_train.shape, y_train.shape,
        x_test.shape, y_test.shape,
        x_val.shape, y_val.shape,
    )
)


In [None]:
# Convert dataset using one-hot encoder for categorical columns and standardscaler (mean: 0, std: 1) for numeric columns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# sparse=False requests dense arrays, which the later np.concatenate with the scaled age column relies on.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` -- confirm against the installed version.
encoder = OneHotEncoder(sparse=False)

In [None]:
# Preprocess train_data

# Fit the age scaler and the one-hot encoder on the training split, then build
# the encoded training matrix: [scaled age | one-hot encoded remaining columns].
scaler_age = StandardScaler()
scaler_age.fit(x_train[:, :1])

x_train_age = scaler_age.transform(x_train[:, :1])
x_train_remaining = encoder.fit_transform(x_train[:, 1:])

x_train_encoded = np.hstack((x_train_age, x_train_remaining))

print("age column mean: {}, std: {}".format(scaler_age.mean_, scaler_age.scale_))
print("x_train encoded shape: {}".format(x_train_encoded.shape))

In [None]:
# Preprocess test_data

# Encode the test split with the scaler and encoder that were fitted on the training split.
x_test_age = scaler_age.transform(x_test[0:, 0:1])
# transform, not fit_transform: refitting the encoder here would learn the
# categories from the test split, so the column layout could differ from the
# one the model was trained on.
x_test_remaining = encoder.transform(x_test[0:, 1:])

x_test_encoded = np.concatenate((x_test_age, x_test_remaining), axis=1)

In [None]:
# Preprocess val_data

# Encode the validation split with the scaler and encoder that were fitted on the training split.
x_val_age = scaler_age.transform(x_val[0:, 0:1])
# transform, not fit_transform: refitting the encoder here would learn the
# categories from the validation split, so the column layout could differ from
# the one the model was trained on.
x_val_remaining = encoder.transform(x_val[0:, 1:])

x_val_encoded = np.concatenate((x_val_age, x_val_remaining), axis=1)

In [None]:
#######################################################################################
#
# Build ml models
#

In [None]:
# Define metrics for ml models

# Default metric set for the model builders below (used as their `metrics` default).
METRICS = [
    # raw confusion-matrix counts
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    # summary scores
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

In [None]:
# Define various ml models e.g., standard neural network and logistic regression

def build_snn_w_adam(input_dim, learning_rate = 1e-3, beta_1 = 0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
                     metrics=METRICS, output_bias=None):
    """Build and compile a one-hidden-layer binary classifier trained with Adam.

    Architecture: Dense(32, relu) -> Dropout(0.5) -> Dense(1, sigmoid).
    Loss: binary cross-entropy on probabilities (from_logits=False).

    Args:
        input_dim: number of input features.
        learning_rate, beta_1, beta_2, epsilon, amsgrad: forwarded to
            keras.optimizers.Adam.
        metrics: metrics passed to model.compile.
        output_bias: optional constant used to initialise the output layer's
            bias; when None, no explicit initializer value is supplied.

    Returns:
        The compiled keras.Sequential model.
    """
    # initialize output bias if specified
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = keras.Sequential()
    model.add(layers.Dense(32, activation='relu', input_dim=input_dim))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid', bias_initializer=output_bias))

    # `learning_rate` replaces the deprecated `lr` alias, matching the keyword
    # the SGD builders in this notebook already use.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate,
                                                  beta_1=beta_1,
                                                  beta_2=beta_2,
                                                  epsilon=epsilon,
                                                  amsgrad=amsgrad),
                  loss=keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=metrics)

    return model

def build_snn_w_sgd(input_dim, learning_rate = 0.01, momentum=0.01, nesterov=False, metrics=METRICS, output_bias=None):
    """Build and compile a one-hidden-layer binary classifier trained with SGD.

    Architecture: Dense(32, relu) -> Dropout(0.5) -> Dense(1, sigmoid), compiled
    with binary cross-entropy (from_logits=False) and the given metrics.
    `output_bias`, when given, becomes the constant initial bias of the output layer.
    Returns the compiled keras.Sequential model.
    """
    bias_init = None if output_bias is None else tf.keras.initializers.Constant(output_bias)

    network = keras.Sequential([
        layers.Dense(32, activation='relu', input_dim=input_dim),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid', bias_initializer=bias_init),
    ])

    sgd = keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum, nesterov=nesterov, name="SGD")
    network.compile(
        optimizer=sgd,
        loss=keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=metrics,
    )
    return network

def build_lr_w_sgd(input_dim, learning_rate = 0.01, momentum=0.01, nesterov=False, metrics=METRICS, output_bias=None):
    """Build and compile a logistic-regression model (single sigmoid unit) trained with SGD.

    The model is one Dense(1, sigmoid) layer over `input_dim` features, compiled
    with binary cross-entropy (from_logits=False) and the given metrics.
    `output_bias`, when given, becomes the constant initial bias of that layer.
    Returns the compiled keras.Sequential model.
    """
    bias_init = None if output_bias is None else tf.keras.initializers.Constant(output_bias)

    network = keras.Sequential([
        layers.Dense(1, activation='sigmoid', input_dim=input_dim, bias_initializer=bias_init),
    ])

    sgd = keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum, nesterov=nesterov, name="SGD")
    network.compile(
        optimizer=sgd,
        loss=keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=metrics,
    )
    return network


In [None]:
#######################################################################################
#
# Find correct initial bias, checkpoint the initial weights and confirm whether 
# the bias fix helps or not
#

In [None]:
# Find correct initial bias
#
# The bias is log(positives / negatives) of the labels the model is actually
# trained on. Training uses the oversampled y_train, so the raw-dataset counts
# (pos / neg) would start the output far from the training class balance.
train_pos = float(np.sum(y_train))
train_neg = float(y_train.size) - train_pos
initial_bias = np.log([train_pos / train_neg])
print(initial_bias)

In [None]:
# Build the model and review the structure

# Baseline model without an explicit output bias; input width = number of encoded features
model_b = build_snn_w_adam(x_train_encoded.shape[1])
model_b.summary()

In [None]:
# Run the freshly built (not yet fitted) model on the training features as a quick sanity check.
# NOTE(review): `steps=10` presumably caps prediction at 10 batches rather than the full set -- confirm.

model_b.predict(x=x_train_encoded, steps=10)

In [None]:
# Evaluate previous model

# Loss of the unfitted baseline on the training data (first entry of the evaluate() result)
results = model_b.evaluate(x_train_encoded, y_train, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

In [None]:
# Build the model with initial_bias

# Same architecture, but with the output bias initialised to initial_bias
model_i = build_snn_w_adam(x_train_encoded.shape[1], output_bias=initial_bias)
model_i.predict(x_train_encoded, steps=10)
results = model_i.evaluate(x_train_encoded, y_train, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

In [None]:
# Save initial_weights

# Checkpoint model_i's weights in a fresh temporary directory so later models
# can all start from the same initial state.
initial_weights = os.path.join(
    tempfile.mkdtemp(),
    'initial_weights',
)
model_i.save_weights(initial_weights)

In [None]:
# Run models with and without initial bias

# Both runs share the same data and fit settings; only the starting output bias differs.
bias_fit_kwargs = dict(
    x=x_train_encoded,
    y=y_train,
    validation_data=(x_val_encoded, y_val),
    epochs=20,
    batch_size=BATCH_SIZE,
    shuffle=True,
    verbose=0,
)

# Run 1: checkpointed weights, but with the output bias reset to zero
model_v = build_snn_w_adam(input_dim=x_train_encoded.shape[1])
model_v.load_weights(initial_weights)
model_v.layers[-1].bias.assign([0.0])
zero_bias_history = model_v.fit(**bias_fit_kwargs)

# Run 2: checkpointed weights unchanged (bias as saved from model_i)
model_v = build_snn_w_adam(input_dim=x_train_encoded.shape[1])
model_v.load_weights(initial_weights)
careful_bias_history = model_v.fit(**bias_fit_kwargs)

In [None]:
# Define plot_loss

def plot_loss(history, label, n):
    """Plot training (solid) and validation (dashed) loss of one run on a log y-axis.

    `history` is a Keras History object, `label` is appended to the legend
    entries, and `n` selects the line colour from the module-level `colors`.
    """
    epochs, logs, line_color = history.epoch, history.history, colors[n]

    # semilogy: a log scale shows the wide range of loss values.
    plt.semilogy(epochs, logs['loss'], color=line_color, label='Train ' + label)
    plt.semilogy(epochs, logs['val_loss'], color=line_color, label='Val ' + label, linestyle="--")

    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

In [None]:
# Confirm the bias fix with plot_loss graphs

# One colour per run: index 0 for the zero-bias run, index 1 for the careful-bias run
for run_index, (run_history, run_label) in enumerate([
    (zero_bias_history, "Zero Bias"),
    (careful_bias_history, "Careful Bias"),
]):
    plot_loss(run_history, run_label, run_index)

In [None]:
#######################################################################################
#
# Plot functions
# 

In [None]:
def plot_metrics(history):
    """Draw a 2x2 grid of train (solid) vs. validation (dashed) curves.

    Panels, in order: loss, auc, precision, recall. `history` is a Keras History
    object whose `history` dict holds each metric and its 'val_' counterpart.
    """
    epochs, logs = history.epoch, history.history

    for position, metric in enumerate(('loss', 'auc', 'precision', 'recall'), start=1):
        plt.subplot(2, 2, position)
        plt.plot(epochs, logs[metric], color=colors[0], label='Train')
        plt.plot(epochs, logs['val_' + metric], color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(metric.replace("_", " ").capitalize())

        # y-range per panel: loss keeps its automatic upper bound, auc is
        # zoomed to [0.7, 1], everything else spans [0, 1].
        if metric == 'loss':
            lower, upper = 0, plt.ylim()[1]
        elif metric == 'auc':
            lower, upper = 0.7, 1
        else:
            lower, upper = 0, 1
        plt.ylim([lower, upper])

        plt.legend()


In [None]:
# Define plot_receiver_operating_characteristic function

def plot_roc(name, labels, predictions, **kwargs):
    """Plot one ROC curve (in percent) on the current axes.

    `name` is the legend label; `labels` and `predictions` are passed to
    sklearn.metrics.roc_curve; extra keyword arguments go to plt.plot.
    """
    false_pos_rate, true_pos_rate, _thresholds = sklearn.metrics.roc_curve(labels, predictions)

    plt.plot(100 * false_pos_rate, 100 * true_pos_rate, label=name, linewidth=2, **kwargs)
    plt.xlabel('False positives [%]')
    plt.ylabel('True positives [%]')

    # Full false-positive range; true-positive axis starts at 40%.
    plt.xlim([-0.5, 100.5])
    plt.ylim([40, 100.5])
    plt.grid(True)
    plt.gca().set_aspect('equal')

In [None]:
#######################################################################################
#
# Start training on the oversampled training set (no class weights are passed to fit)
# 
# Note: oversampling the positive class makes the model pay more attention to the under-represented data
#

In [None]:
# Run the model
# Main training run: start from the checkpointed initial weights
model = build_snn_w_adam(x_train_encoded.shape[1], learning_rate=LEARNING_RATE)
model.load_weights(initial_weights)

history = model.fit(
    x_train_encoded,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_val_encoded, y_val),
    shuffle=True,
    verbose=1,
)

In [None]:
# Plot train vs. validation curves (loss, auc, precision, recall) for the main training run
plot_metrics(history)

In [None]:
# Define plot_confusion_matrix function

def plot_cm(labels, predictions, p=0.5):
    """Plot the confusion matrix at threshold `p` and print its four cells.

    Args:
        labels: true binary labels (0 = no ICU, 1 = ICU).
        predictions: predicted scores; a score strictly greater than `p`
            counts as a positive (ICU) prediction.
        p: decision threshold, 0.5 by default.
    """
    # Rows are actual labels, columns are predicted labels.
    cm = confusion_matrix(labels, predictions > p)
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    print('No ICU Correctly Detected (True Negatives): ', cm[0][0])
    print('ICU Incorrectly Detected (False Positives): ', cm[0][1])
    # cm[1][0] counts actual ICU cases predicted as no ICU, i.e. ICU cases that
    # were missed; the previous label "No ICU Missed" described the wrong class.
    print('ICU Missed (False Negatives): ', cm[1][0])
    print('ICU Detected (True Positives): ', cm[1][1])
    print('Total ICU: ', np.sum(cm[1]))

In [None]:
#######################################################################################
#
# Evaluate metrics with the trained model (`model`)
# 

In [None]:
# Predicted scores for the (oversampled) training split and the held-out test split
train_predictions, test_predictions = (
    model.predict(features, batch_size=BATCH_SIZE)
    for features in (x_train_encoded, x_test_encoded)
)

In [None]:
# Evaluate the model on the test dataset

# Evaluate on the test split, list every metric by name, then show the confusion matrix
results = model.evaluate(x_test_encoded, y_test, batch_size=BATCH_SIZE, verbose=0)

for name, value in zip(model.metrics_names, results):
    print(name, ': ', value)
print()

plot_cm(y_test, test_predictions)

#
# [RESULTS]
# 

In [None]:
# Plot the roc

# Train (solid) and test (dashed) ROC curves share one colour
for roc_name, roc_labels, roc_scores, roc_style in (
    ("Train Weighted", y_train, train_predictions, {}),
    ("Test Weighted", y_test, test_predictions, {'linestyle': '--'}),
):
    plot_roc(roc_name, roc_labels, roc_scores, color=colors[1], **roc_style)

plt.legend(loc='lower right')

In [None]:
#######################################################################################