### Imports and load the data

In [12]:
from showupforhealth.utils import perform_train_test_split, oversample_with_smote
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 40)
import numpy as np
from imblearn.over_sampling import SMOTE
import time
from tensorflow.keras import Sequential, layers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.metrics import Recall
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import seaborn as sns
import keras_tuner as kt
from showupforhealth.ml_functions.predict import make_predict
from showupforhealth.utils import fit_scaler, transform_data
import datetime
from sklearn.model_selection import train_test_split

# DL model

### Load and shuffle the data

In [13]:
data = pd.read_csv('/Users/alessio/code/janduplessis883/data-showup/data/output-data/full_train_data.csv').sample(frac = 1)

In [14]:
data.head(3)

Unnamed: 0,Appointment_status,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,No_shows,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
592565,1.0,49720704.0,5.6,0.0,71.0,0.0,0.14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12935.0,0.503645,1.16641,0.0,0.0,10.0,-0.568065,0.822984,-0.965926,-0.258819,-0.5,0.866025,0.0,1.0,14.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
517589,0.0,47485383.0,15.6,0.0,76.0,1.0,0.17,0.0,0.0,0.0,1.0,1.0,0.0,0.0,12666.0,0.150116,1.816196,8.0,0.0,20.0,0.568065,-0.822984,-0.707107,-0.707107,0.5,-0.866025,0.974928,-0.222521,4.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
168958,1.0,27094130.0,21.2,0.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25448.0,0.497369,2.955201,0.0,0.0,14.0,-0.120537,-0.992709,0.707107,-0.707107,-0.5,-0.866025,-0.433884,-0.900969,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
data_DNA = data[data['Appointment_status']==0]
data_DNA.shape

(21909, 38)

In [16]:
data_NOT_DNA = data[data['Appointment_status']==1].sample(21909)
data_NOT_DNA.shape

(21909, 38)

In [17]:
data_balanced = pd.concat([data_DNA, data_NOT_DNA]).sample(frac = 1)
data_balanced['Appointment_status'].value_counts()

1.0    21909
0.0    21909
Name: Appointment_status, dtype: int64

### Split in X and y

In [18]:
X = data_balanced.drop(columns=['Appointment_status', 'Patient ID'])
y = data_balanced['Appointment_status']

### Baseline accuracy

In [19]:
print(f'The baseline accuracy if we assume that all patients will show-up is {y.value_counts()[1] / X.shape[0]}')

The baseline accuracy if we assume that all patients will show-up is 0.5


### Split in Train and Test

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Split Train and Validation

In [21]:
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.09, random_state=42)

### Global Scaler

In [22]:
global_scaler = fit_scaler(X_train_val)

In [23]:
# # Save Scaler
# from pickle import dump
# dump(global_scaler, open('scaler_alex.pkl', 'wb'))

In [24]:
X_train_val_scaled = transform_data(X_train_val, global_scaler)
X_val_scaled = transform_data(X_val, global_scaler)
X_test_scaled = transform_data(X_test, global_scaler)

### Smote

In [25]:
# X_train_smote, y_train_smote = oversample_with_smote(X_train_val_scaled, y_train_val)
# y_train_smote.value_counts()

### Define Metrics

In [28]:
metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),
          tf.keras.metrics.Recall(name='recall'),
          tf.keras.metrics.Precision(name='precision'),
          tf.keras.metrics.AUC(name='auc'),
          tf.keras.metrics.AUC(curve='PR', name='auc-pr')]

# Model

### Model 1

In [47]:
%%time
model_one = Sequential()
es = EarlyStopping(patience=10, monitor='val_accuracy', restore_best_weights=True)

model_one.add(layers.Dense(112, activation='relu', input_dim=X.shape[1]))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(32, activation='relu'))
model_one.add(layers.Dense(1, activation='sigmoid'))

adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model_one.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics)

history_one = model_one.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                    batch_size=32,
                    epochs=300,
                    shuffle=True,
                    verbose=0,
                    callbacks=[es])

results = model_one.evaluate(X_test_scaled, y_test)
print(f'Model {model_one.metrics_names[0]} is {results[0]}')
print(f'Model {model_one.metrics_names[1]} is {results[1]}')
print(f'Model {model_one.metrics_names[2]} is {results[2]}')
print(f'Model {model_one.metrics_names[3]} is {results[3]}')
print(f'Model {model_one.metrics_names[4]} is {results[4]}')


KeyboardInterrupt



In [48]:
model_one_pred = model_one.predict(X_test_scaled)
model_one_pred = (model_one_pred > 0.5).astype(np.float32)
model_one_pred


KeyboardInterrupt



In [None]:
confusion = confusion_matrix(y_test, model_one_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
plot_loss_precision_recall_curve(history_one)

### Plot curves function

In [None]:
def plot_loss_precision_recall_curve(history):
    fig, ax = plt.subplots(2, 2, figsize=(15, 10))

        # --- ACCURACY 
    ax[1, 0].plot(history.history['accuracy'], color="#a10606")
    ax[1, 0].plot(history.history['val_accuracy'], color="#1b5743")
    ax[1, 0].set_title('Model accuracy', fontsize = 18)
    ax[1, 0].set_ylabel('Accuracy', fontsize = 14)
    ax[1, 0].legend(['Train', 'Val'], loc='upper right')
    ax[1, 0].grid(axis="x", linewidth=0.5)
    ax[1, 0].grid(axis="y", linewidth=0.5)
    
    # --- LOSS 
#     ax[0, 0].plot(history.history['loss'], color="#a10606")
#     ax[0, 0].plot(history.history['val_loss'], color="#1b5743")
#     ax[0, 0].set_title('Model loss', fontsize = 18)
#     ax[0, 0].set_ylabel('Loss', fontsize = 14)
#     ax[0, 0].legend(['Train', 'Val'], loc='upper right')
#     ax[0, 0].grid(axis="x", linewidth=0.5)
#     ax[0, 0].grid(axis="y", linewidth=0.5)

#     --- RECALL
    ax[0, 1].plot(history.history['recall']) 
    ax[0, 1].plot(history.history['val_recall'])
    ax[0, 1].set_title('Model recall', fontsize = 18)
    ax[0, 1].set_ylabel('Recall', fontsize = 14) 
    ax[0, 1].legend(['Train', 'Val'], loc='lower right')
    ax[0, 1].grid(axis="x", linewidth=0.5)
    ax[0, 1].grid(axis="y", linewidth=0.5)

#     --- PRECISION
    ax[0,0].plot(history.history['precision'])
    ax[0,0].plot(history.history['val_precision'])
    ax[0,0].set_title('Model precision', fontsize = 18)
    ax[0,0].set_ylabel('Precision', fontsize = 14)
    ax[0,0].legend(['Train', 'Val'], loc='lower right')
    ax[0,0].grid(axis="x", linewidth=0.5)
    ax[0,0].grid(axis="y", linewidth=0.5)
    
    # --- AUC
    ax[1, 1].plot(history.history['auc'])
    ax[1, 1].plot(history.history['val_auc'])
    ax[1, 1].set_title('Model AUC', fontsize = 18) 
    ax[1, 1].set_ylabel('AUC', fontsize = 14)
    ax[1, 1].legend(['Train', 'Val'], loc='lower right')
    ax[1, 1].grid(axis="x", linewidth=0.5)
    ax[1, 1].grid(axis="y", linewidth=0.5)

    # Set common labels  
    fig.text(0.5, 0.04, 'Epoch', ha='center', va='center', fontsize=14) 
    plt.show()

### Keras tuner Model 1

In [None]:
def model_builder(hp):
    model = Sequential()
    
    hp_units_1 = hp.Int('units_1', min_value=16, max_value=256, step=16)
    hp_units_2 = hp.Int('units_2', min_value=16, max_value=256, step=16)
    hp_units_3 = hp.Int('units_3', min_value=16, max_value=256, step=16)
    hp_units_4 = hp.Int('units_4', min_value=16, max_value=256, step=16)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3])

    model.add(layers.Dense(units=hp_units_1, activation='relu', input_dim=X.shape[1]))
    model.add(layers.Dense(units=hp_units_2, activation='relu'))
    model.add(layers.Dense(units=hp_units_3, activation='relu'))
    model.add(layers.Dense(units=hp_units_4, activation='relu'))
    
    model.add(layers.Dense(1, activation='sigmoid'))

    adam = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=metrics)
    return model

In [None]:
LOG_DIR = f'{int(time.time())}'
tuner_kt_1 = kt.RandomSearch(model_builder,
                     objective=kt.Objective('val_auc', direction="max"),
                     directory=f'/Users/alessio/Desktop/kt-logs/{LOG_DIR}')

stop_early = EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True)
                     
tuner_kt_1.search(X_train_val, y_train_val, validation_data=(X_val, y_val),
             epochs=200,
             callbacks=[stop_early])

In [None]:
best_hps=tuner_kt_1.get_best_hyperparameters(num_trials=1)[0]
best_hps.__dict__

In [None]:
# TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = TensorBoard(log_dir=log_dir)
# EarlyStopping
es = EarlyStopping(patience=50, monitor='val_auc', restore_best_weights=True)

In [None]:
# Build Model with Best hyperparamethers
model_kt_1 = tuner_kt_1.hypermodel.build(best_hps)
# Train model
history_kt_1 = model_kt_1.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                    epochs=500,
                    batch_size=128,
                    verbose=0,
                    callbacks=[es, tensorboard])

In [None]:
model_kt_1.evaluate(X_test_scaled, y_test)

In [29]:
# Callbacks
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = TensorBoard(log_dir=log_dir)
es = EarlyStopping(patience=20, monitor='val_auc', restore_best_weights=True)

model_one = Sequential()
model_one.add(layers.Dense(48, activation='relu', input_dim=X.shape[1]))
model_one.add(layers.Dropout(0.3))
model_one.add(layers.Dense(224, activation='relu'))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(96, activation='relu'))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(192, activation='relu'))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(1, activation='sigmoid'))

adam = tf.keras.optimizers.Adam(learning_rate=0.0001)
model_one.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics)

history_one = model_one.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                    batch_size=16,
                    epochs=500,
                    verbose=0,
                    callbacks=[es,tensorboard])

results = model_one.evaluate(X_test_scaled, y_test)
print(f'Model {model_one.metrics_names[0]} is {results[0]}')
print(f'Model {model_one.metrics_names[1]} is {results[1]}')
print(f'Model {model_one.metrics_names[2]} is {results[2]}')
print(f'Model {model_one.metrics_names[3]} is {results[3]}')
print(f'Model {model_one.metrics_names[4]} is {results[4]}')

KeyboardInterrupt: 

In [None]:
# Save model
model_one.save('model_two.h5')

In [None]:
plot_loss_precision_recall_curve(history_one)

In [None]:
# def model_builder_3(hp):
#     model = Sequential()
#     # Hp
#     hp_units_1 = hp.Int('units_1', min_value=32, max_value=128, step=16)
#     hp_units_2 = hp.Int('units_2', min_value=32, max_value=128, step=16)
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4])

#     model.add(layers.Dense(units=hp_units_1, activation='relu', input_dim=X.shape[1]))
#     model.add(layers.Dropout(0.2))
#     model.add(layers.Dense(units=hp_units_2, activation='relu'))
#     model.add(layers.Dropout(0.2))

#     model.add(layers.Dense(1, activation='sigmoid'))

#     adam = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
#     model.compile(optimizer=adam, loss='binary_crossentropy', metrics=metrics)
#     return model

# Keras Tuner model_3

In [29]:
def model_builder_3(hp):
    model = Sequential()
    
    for i in range(hp.Int("num_layers", 1, 2)):
        model.add(layers.Dense(units=hp.Int(f"units_{i}", min_value=32, max_value=256, step=32),
                               activation=hp.Choice("activation", ["relu", "tanh"])))

    if hp.Boolean("dropout"):
            model.add(layers.Dropout(rate=0.25))
            
    model.add(layers.Dense(1, activation='sigmoid'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4])
    adam = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=metrics)
    return model

In [30]:
LOG_DIR = f'{int(time.time())}'

tuner_kt_3 = kt.RandomSearch(model_builder_3,
                             max_trials=100,
                             objective=kt.Objective('val_auc-pr', direction="max"),
                             directory=f'/Users/alessio/Desktop/kt-logs/{LOG_DIR}',
                             seed=42)

stop_early = EarlyStopping(monitor='val_auc-pr', patience=5, restore_best_weights=True)
                     
tuner_kt_3.search(X_train_val_scaled, y_train_val, validation_data=(X_val_scaled, y_val),
                  epochs=200,
                  batch_size=32,
                  callbacks=[stop_early])

Trial 100 Complete [00h 00m 15s]
val_auc-pr: 0.8949145078659058

Best val_auc-pr So Far: 0.9261295795440674
Total elapsed time: 00h 23m 35s
INFO:tensorflow:Oracle triggered exit


In [31]:
best_hps = tuner_kt_3.get_best_hyperparameters(num_trials=1)[0]
best_hps.__dict__

{'_name_scopes': [],
 '_conditions': [],
 '_hps': defaultdict(list,
             {'num_layers': [Int(name: 'num_layers', min_value: 1, max_value: 2, step: 1, sampling: linear, default: 1)],
              'units_0': [Int(name: 'units_0', min_value: 32, max_value: 256, step: 32, sampling: linear, default: 32)],
              'activation': [Choice(name: 'activation', values: ['relu', 'tanh'], ordered: False, default: relu)],
              'dropout': [Boolean(name: "dropout", default: False)],
              'learning_rate': [Choice(name: 'learning_rate', values: [0.1, 0.01, 0.001, 0.0001], ordered: True, default: 0.1)],
              'units_1': [Int(name: 'units_1', min_value: 32, max_value: 256, step: 32, sampling: linear, default: 32)]}),
 '_space': [Int(name: 'num_layers', min_value: 1, max_value: 2, step: 1, sampling: linear, default: 1),
  Int(name: 'units_0', min_value: 32, max_value: 256, step: 32, sampling: linear, default: 32),
  Choice(name: 'activation', values: ['relu', 'tanh']

In [32]:
tuner_kt_3.results_summary()

Results summary
Results in /Users/alessio/Desktop/kt-logs/1695279724/untitled_project
Showing 10 best trials
Objective(name="val_auc-pr", direction="max")

Trial 036 summary
Hyperparameters:
num_layers: 1
units_0: 192
activation: relu
dropout: False
learning_rate: 0.01
units_1: 96
Score: 0.9261295795440674

Trial 046 summary
Hyperparameters:
num_layers: 1
units_0: 192
activation: relu
dropout: False
learning_rate: 0.01
units_1: 224
Score: 0.9258213043212891

Trial 005 summary
Hyperparameters:
num_layers: 1
units_0: 32
activation: relu
dropout: True
learning_rate: 0.01
units_1: 96
Score: 0.9251797795295715

Trial 058 summary
Hyperparameters:
num_layers: 2
units_0: 160
activation: relu
dropout: True
learning_rate: 0.01
units_1: 256
Score: 0.9242020845413208

Trial 009 summary
Hyperparameters:
num_layers: 2
units_0: 96
activation: relu
dropout: False
learning_rate: 0.01
units_1: 64
Score: 0.9240866303443909

Trial 034 summary
Hyperparameters:
num_layers: 1
units_0: 256
activation: relu
dr

In [34]:
log_dir = "logs/fit/" + 'tuner_kt_3'
tensorboard = TensorBoard(log_dir=log_dir)

es = EarlyStopping(patience=25, monitor='val_accuracy', restore_best_weights=True)

model_kt_3 = tuner_kt_3.hypermodel.build(best_hps)

history_kt_3 = model_kt_3.fit(X_train_val_scaled, y_train_val, validation_data=(X_val_scaled, y_val),
                              epochs=500,
                              batch_size=32,
                              verbose=0,
                              callbacks=[es, tensorboard])

In [36]:
results = model_kt_3.evaluate(X_test_scaled, y_test)
print(f'Model {model_kt_3.metrics_names[0]} is {results[0]}')
print(f'Model {model_kt_3.metrics_names[1]} is {results[1]}')
print(f'Model {model_kt_3.metrics_names[2]} is {results[2]}')
print(f'Model {model_kt_3.metrics_names[3]} is {results[3]}')
print(f'Model {model_kt_3.metrics_names[4]} is {results[4]}')
print(f'Model {model_kt_3.metrics_names[5]} is {results[5]}')

Model loss is 0.3680315911769867
Model accuracy is 0.8331812024116516
Model recall is 0.7908967733383179
Model precision is 0.8679525256156921
Model auc is 0.9081398248672485
Model auc-pr is 0.9268661141395569


In [None]:
# Save model
model_kt_3.save('model_kt_3.h5')

In [27]:
plot_loss_precision_recall_curve(history_kt_3)

NameError: name 'plot_loss_precision_recall_curve' is not defined

# Model_4 - model_3 NO KT

In [96]:
# Callbacks
log_dir = "logs/fit/" + '96_32_001_32'
tensorboard = TensorBoard(log_dir=log_dir)
es = EarlyStopping(patience=30, monitor='val_accuracy', restore_best_weights=True)

model_4 = Sequential()

model_4.add(layers.Dense(96, activation='relu', input_dim=X.shape[1]))

model_4.add(layers.Dense(32, activation='relu'))

model_4.add(layers.Dense(1, activation='sigmoid'))

adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model_4.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics)

history_one = model_4.fit(X_train_smote, y_train_smote, validation_data=(X_val_scaled, y_val),
                    batch_size=32,
                    epochs=500,
                    verbose=0,
                    shuffle=True,
                    callbacks=[es,tensorboard])

results = model_4.evaluate(X_test_scaled, y_test)
print(f'Model {model_4.metrics_names[0]} is {results[0]}')
print(f'Model {model_4.metrics_names[1]} is {results[1]}')
print(f'Model {model_4.metrics_names[2]} is {results[2]}')
print(f'Model {model_4.metrics_names[3]} is {results[3]}')
print(f'Model {model_4.metrics_names[4]} is {results[4]}')

Model loss is 0.39411988854408264
Model accuracy is 0.8210363984107971
Model recall is 0.774193525314331
Model precision is 0.9063583612442017
Model auc is 0.9010422229766846
