### Imports and load the data

In [1]:
from showupforhealth.utils import perform_train_test_split, oversample_with_smote
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 40)
import numpy as np
from imblearn.over_sampling import SMOTE
import time
from tensorflow.keras import Sequential, layers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.metrics import Recall
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import seaborn as sns
from sklearn.metrics import f1_score
import keras_tuner as kt
from showupforhealth.ml_functions.predict import make_predict
from showupforhealth.utils import fit_scaler, transform_data
import datetime
from sklearn.model_selection import train_test_split

2023-09-19 20:25:07.491623: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# DL model

### Load and shuffle the data

In [2]:
data = pd.read_csv('/Users/alessio/code/janduplessis883/data-showup/data/output-data/full_train_data.csv').sample(frac = 1)

In [3]:
data.head(3)

Unnamed: 0,Appointment_status,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,No_shows,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
52098,1.0,32188625.0,14.6,0.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17632.0,0.876295,0.255347,0.0,1.0,5.0,-1.0,-1.83697e-16,1.224647e-16,-1.0,-1.0,-1.83697e-16,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
231476,1.0,49855764.0,12.5,0.1,50.0,0.0,0.08,1.0,1.0,0.0,0.0,0.0,0.0,0.0,23221.0,0.371939,0.369845,19.0,0.0,32.0,0.885456,-0.4647232,-0.5,-0.866025,0.866025,-0.5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
311950,1.0,9425015.0,12.4,0.0,53.0,1.0,0.08,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11753.0,0.601035,0.629445,0.0,1.0,17.0,-1.0,-1.83697e-16,0.5,-0.866025,-1.0,-1.83697e-16,0.974928,-0.222521,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
data_DNA = data[data['Appointment_status']==0]
data_DNA.shape

(21909, 38)

In [5]:
data_NOT_DNA = data[data['Appointment_status']==1][:50000]
data_NOT_DNA.shape

(50000, 38)

In [6]:
data_balanced = pd.concat([data_DNA, data_NOT_DNA]).sample(frac = 1)
data_balanced['Appointment_status'].value_counts()

1.0    50000
0.0    21909
Name: Appointment_status, dtype: int64

In [7]:
data_balanced.head(3)

Unnamed: 0,Appointment_status,Patient ID,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,No_shows,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
408982,1.0,47302899.0,9.4,0.0,45.0,1.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23591.0,0.194314,0.728367,10.0,0.0,32.0,0.822984,0.568065,-0.5,-0.866025,0.8660254,0.5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
215812,1.0,17373733.0,14.6,0.0,48.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25185.0,0.50497,0.624106,0.0,0.0,5.0,-0.992709,-0.120537,0.707107,-0.707107,-1.0,-1.83697e-16,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
503779,0.0,47457567.0,8.6,0.0,73.0,0.0,0.22,1.0,0.0,0.0,0.0,1.0,1.0,1.0,16091.0,0.771894,0.857578,1.0,0.0,54.0,-0.464723,0.885456,-0.707107,-0.707107,-2.449294e-16,1.0,-0.433884,-0.900969,13.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Split in X and y

In [8]:
X = data_balanced.drop(columns=['Appointment_status', 'Patient ID'])
y = data_balanced['Appointment_status']

### Baseline accuracy

In [9]:
print(f'The baseline accuracy if we assume that all patients will show-up is {y.value_counts()[1] / X.shape[0]}')

The baseline accuracy if we assume that all patients will show-up is 0.6953232557816129


### Split in Train and Test

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Global Scaler

In [11]:
# Fit Scaler
global_scaler = fit_scaler(X_train)

In [12]:
# # Save Scaler
# from pickle import dump
# dump(global_scaler, open('scaler_alex.pkl', 'wb'))

In [13]:
# Transform with Scale
X_train_scaled = transform_data(X_train, global_scaler)
X_test_scaled = transform_data(X_test, global_scaler)

### Smote

In [14]:
# X_train_smote, y_train_smote = oversample_with_smote(X_train_scaled, y_train)

In [15]:
# y_train_smote.value_counts()

### Split Train and Validation

In [16]:
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train_scaled, y_train, test_size=0.09, random_state=42)

### Define Metrics

In [17]:
metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy'),
          tf.keras.metrics.Recall(name='recall'),
          tf.keras.metrics.Precision(name='precision'),
          tf.keras.metrics.AUC(name='auc')]

2023-09-19 20:25:39.017297: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Model

### Model 1

In [47]:
%%time
model_one = Sequential()
es = EarlyStopping(patience=10, monitor='val_accuracy', restore_best_weights=True)

model_one.add(layers.Dense(112, activation='relu', input_dim=X.shape[1]))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(32, activation='relu'))
model_one.add(layers.Dense(1, activation='sigmoid'))

adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model_one.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics)

history_one = model_one.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                    batch_size=32,
                    epochs=300,
                    shuffle=True,
                    verbose=0,
                    callbacks=[es])

results = model_one.evaluate(X_test_scaled, y_test)
print(f'Model {model_one.metrics_names[0]} is {results[0]}')
print(f'Model {model_one.metrics_names[1]} is {results[1]}')
print(f'Model {model_one.metrics_names[2]} is {results[2]}')
print(f'Model {model_one.metrics_names[3]} is {results[3]}')
print(f'Model {model_one.metrics_names[4]} is {results[4]}')


KeyboardInterrupt



In [48]:
model_one_pred = model_one.predict(X_test_scaled)
model_one_pred = (model_one_pred > 0.5).astype(np.float32)
model_one_pred


KeyboardInterrupt



In [None]:
confusion = confusion_matrix(y_test, model_one_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
plot_loss_precision_recall_curve(history_one)

### Plot curves function

In [None]:
def plot_loss_precision_recall_curve(history):
    fig, ax = plt.subplots(2, 2, figsize=(15, 10))

        # --- ACCURACY 
    ax[1, 0].plot(history.history['accuracy'], color="#a10606")
    ax[1, 0].plot(history.history['val_accuracy'], color="#1b5743")
    ax[1, 0].set_title('Model accuracy', fontsize = 18)
    ax[1, 0].set_ylabel('Accuracy', fontsize = 14)
    ax[1, 0].legend(['Train', 'Val'], loc='upper right')
    ax[1, 0].grid(axis="x", linewidth=0.5)
    ax[1, 0].grid(axis="y", linewidth=0.5)
    
    # --- LOSS 
#     ax[0, 0].plot(history.history['loss'], color="#a10606")
#     ax[0, 0].plot(history.history['val_loss'], color="#1b5743")
#     ax[0, 0].set_title('Model loss', fontsize = 18)
#     ax[0, 0].set_ylabel('Loss', fontsize = 14)
#     ax[0, 0].legend(['Train', 'Val'], loc='upper right')
#     ax[0, 0].grid(axis="x", linewidth=0.5)
#     ax[0, 0].grid(axis="y", linewidth=0.5)

#     --- RECALL
    ax[0, 1].plot(history.history['recall']) 
    ax[0, 1].plot(history.history['val_recall'])
    ax[0, 1].set_title('Model recall', fontsize = 18)
    ax[0, 1].set_ylabel('Recall', fontsize = 14) 
    ax[0, 1].legend(['Train', 'Val'], loc='lower right')
    ax[0, 1].grid(axis="x", linewidth=0.5)
    ax[0, 1].grid(axis="y", linewidth=0.5)

#     --- PRECISION
    ax[0,0].plot(history.history['precision'])
    ax[0,0].plot(history.history['val_precision'])
    ax[0,0].set_title('Model precision', fontsize = 18)
    ax[0,0].set_ylabel('Precision', fontsize = 14)
    ax[0,0].legend(['Train', 'Val'], loc='lower right')
    ax[0,0].grid(axis="x", linewidth=0.5)
    ax[0,0].grid(axis="y", linewidth=0.5)
    
    # --- AUC
    ax[1, 1].plot(history.history['auc'])
    ax[1, 1].plot(history.history['val_auc'])
    ax[1, 1].set_title('Model AUC', fontsize = 18) 
    ax[1, 1].set_ylabel('AUC', fontsize = 14)
    ax[1, 1].legend(['Train', 'Val'], loc='lower right')
    ax[1, 1].grid(axis="x", linewidth=0.5)
    ax[1, 1].grid(axis="y", linewidth=0.5)

    # Set common labels  
    fig.text(0.5, 0.04, 'Epoch', ha='center', va='center', fontsize=14) 
    plt.show()

### Keras tuner Model 1

In [None]:
def model_builder(hp):
    model = Sequential()
    
    hp_units_1 = hp.Int('units_1', min_value=16, max_value=256, step=16)
    hp_units_2 = hp.Int('units_2', min_value=16, max_value=256, step=16)
    hp_units_3 = hp.Int('units_3', min_value=16, max_value=256, step=16)
    hp_units_4 = hp.Int('units_4', min_value=16, max_value=256, step=16)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3])

    model.add(layers.Dense(units=hp_units_1, activation='relu', input_dim=X.shape[1]))
    model.add(layers.Dense(units=hp_units_2, activation='relu'))
    model.add(layers.Dense(units=hp_units_3, activation='relu'))
    model.add(layers.Dense(units=hp_units_4, activation='relu'))
    
    model.add(layers.Dense(1, activation='sigmoid'))

    adam = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=metrics)
    return model

In [None]:
LOG_DIR = f'{int(time.time())}'
tuner_kt_1 = kt.RandomSearch(model_builder,
                     objective=kt.Objective('val_auc', direction="max"),
                     directory=f'/Users/alessio/Desktop/kt-logs/{LOG_DIR}')

stop_early = EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True)
                     
tuner_kt_1.search(X_train_val, y_train_val, validation_data=(X_val, y_val),
             epochs=200,
             callbacks=[stop_early])

In [None]:
best_hps=tuner_kt_1.get_best_hyperparameters(num_trials=1)[0]
best_hps.__dict__

In [None]:
# TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = TensorBoard(log_dir=log_dir)
# EarlyStopping
es = EarlyStopping(patience=50, monitor='val_auc', restore_best_weights=True)

In [None]:
# Build Model with Best hyperparamethers
model_kt_1 = tuner_kt_1.hypermodel.build(best_hps)
# Train model
history_kt_1 = model_kt_1.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                    epochs=500,
                    batch_size=128,
                    verbose=0,
                    callbacks=[es, tensorboard])

In [None]:
model_kt_1.evaluate(X_test_scaled, y_test)

In [29]:
# Callbacks
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = TensorBoard(log_dir=log_dir)
es = EarlyStopping(patience=20, monitor='val_auc', restore_best_weights=True)

model_one = Sequential()
model_one.add(layers.Dense(48, activation='relu', input_dim=X.shape[1]))
model_one.add(layers.Dropout(0.3))
model_one.add(layers.Dense(224, activation='relu'))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(96, activation='relu'))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(192, activation='relu'))
model_one.add(layers.Dropout(0.2))
model_one.add(layers.Dense(1, activation='sigmoid'))

adam = tf.keras.optimizers.Adam(learning_rate=0.0001)
model_one.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics)

history_one = model_one.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                    batch_size=16,
                    epochs=500,
                    verbose=0,
                    callbacks=[es,tensorboard])

results = model_one.evaluate(X_test_scaled, y_test)
print(f'Model {model_one.metrics_names[0]} is {results[0]}')
print(f'Model {model_one.metrics_names[1]} is {results[1]}')
print(f'Model {model_one.metrics_names[2]} is {results[2]}')
print(f'Model {model_one.metrics_names[3]} is {results[3]}')
print(f'Model {model_one.metrics_names[4]} is {results[4]}')

KeyboardInterrupt: 

In [None]:
# Save model
model_one.save('model_two.h5')

In [None]:
plot_loss_precision_recall_curve(history_one)

# Keras Tuner model_3

In [18]:
def model_builder_3(hp):
    model = Sequential()
    # Hp
    hp_units_1 = hp.Int('units_1', min_value=8, max_value=64, step=8)
    hp_units_2 = hp.Int('units_2', min_value=8, max_value=64, step=8)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4])

    model.add(layers.Dense(units=hp_units_1, activation='relu', input_dim=X.shape[1]))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(units=hp_units_2, activation='relu'))
    model.add(layers.Dropout(0.2))

    model.add(layers.Dense(1, activation='sigmoid'))

    adam = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=metrics)
    return model

In [23]:
LOG_DIR = f'{int(time.time())}'

tuner_kt_3 = kt.Hyperband(model_builder_3,
                          objective=kt.Objective('val_accuracy', direction="max"),
                          directory=f'/Users/alessio/Desktop/kt-logs/{LOG_DIR}')

stop_early = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
                     
tuner_kt_3.search(X_train_val, y_train_val, validation_data=(X_val, y_val),
                  epochs=200,
                  callbacks=[stop_early])

Trial 169 Complete [00h 00m 18s]
val_accuracy: 0.7464377880096436

Best val_accuracy So Far: 0.8363948464393616
Total elapsed time: 00h 35m 59s

Search: Running Trial #170

Value             |Best Value So Far |Hyperparameter
24                |8                 |units_1
32                |56                |units_2
0.1               |0.01              |learning_rate
4                 |34                |tuner/epochs
0                 |12                |tuner/initial_epoch
3                 |4                 |tuner/bracket
0                 |3                 |tuner/round

Epoch 1/4
Epoch 2/4


KeyboardInterrupt



In [20]:
best_hps = tuner_kt_3.get_best_hyperparameters(num_trials=1)[0]
best_hps.__dict__

{'_name_scopes': [],
 '_conditions': [],
 '_hps': defaultdict(list,
             {'units_1': [Int(name: 'units_1', min_value: 16, max_value: 256, step: 16, sampling: linear, default: 16)],
              'units_2': [Int(name: 'units_2', min_value: 16, max_value: 256, step: 16, sampling: linear, default: 16)],
              'units_3': [Int(name: 'units_3', min_value: 16, max_value: 256, step: 16, sampling: linear, default: 16)],
              'units_4': [Int(name: 'units_4', min_value: 16, max_value: 256, step: 16, sampling: linear, default: 16)],
              'learning_rate': [Choice(name: 'learning_rate', values: [0.1, 0.01, 0.001, 0.0001], ordered: True, default: 0.1)]}),
 '_space': [Int(name: 'units_1', min_value: 16, max_value: 256, step: 16, sampling: linear, default: 16),
  Int(name: 'units_2', min_value: 16, max_value: 256, step: 16, sampling: linear, default: 16),
  Int(name: 'units_3', min_value: 16, max_value: 256, step: 16, sampling: linear, default: 16),
  Int(name: 'units_

In [34]:
# TensorBoard
log_dir = "logs/fit/" + 'model_kt3'
tensorboard = TensorBoard(log_dir=log_dir)
# EarlyStopping
es = EarlyStopping(patience=50, monitor='val_auc', restore_best_weights=True)
# Build Model with Best hyperparamethers
model_kt_3 = tuner_kt_3.hypermodel.build(best_hps)
# Train model
history_kt_3 = model_kt_3.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                              epochs=500,
                              batch_size=128,
                              verbose=0,
                              callbacks=[es, tensorboard])

KeyboardInterrupt: 

In [26]:
results = model_kt_3.evaluate(X_test_scaled, y_test)
print(f'Model {model_kt_3.metrics_names[0]} is {results[0]}')
print(f'Model {model_kt_3.metrics_names[1]} is {results[1]}')
print(f'Model {model_kt_3.metrics_names[2]} is {results[2]}')
print(f'Model {model_kt_3.metrics_names[3]} is {results[3]}')
print(f'Model {model_kt_3.metrics_names[4]} is {results[4]}')

Model loss is 0.4775252640247345
Model accuracy is 0.8217821717262268
Model recall is 0.850731372833252
Model precision is 0.9096665978431702
Model auc is 0.8793954253196716


In [None]:
# Save model
model_kt_3.save('model_kt_3.h5')

In [27]:
plot_loss_precision_recall_curve(history_kt_3)

NameError: name 'plot_loss_precision_recall_curve' is not defined

# Model_4 - model_3 NO KT

In [130]:
# Callbacks
log_dir = "logs/fit/" + '64-02_64-02_lr001-bs64-100k'
tensorboard = TensorBoard(log_dir=log_dir)
es = EarlyStopping(patience=20, monitor='val_auc', restore_best_weights=True)

model_4 = Sequential()
model_4.add(layers.Dense(64, activation='relu', input_dim=X.shape[1]))
model_4.add(layers.Dropout(0.2))
model_4.add(layers.Dense(64, activation='relu'))
model_4.add(layers.Dropout(0.2))
model_4.add(layers.Dense(8, activation='relu'))


model_4.add(layers.Dense(1, activation='sigmoid'))

adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model_4.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics)

history_one = model_4.fit(X_train_val, y_train_val, validation_data=(X_val, y_val),
                    batch_size=64,
                    epochs=500,
                    verbose=0,
                    shuffle=True,
                    callbacks=[es,tensorboard])

results = model_4.evaluate(X_test_scaled, y_test)
print(f'Model {model_4.metrics_names[0]} is {results[0]}')
print(f'Model {model_4.metrics_names[1]} is {results[1]}')
print(f'Model {model_4.metrics_names[2]} is {results[2]}')
print(f'Model {model_4.metrics_names[3]} is {results[3]}')
print(f'Model {model_4.metrics_names[4]} is {results[4]}')

Model loss is 0.3102816343307495
Model accuracy is 0.8517027497291565
Model recall is 0.9078420400619507
Model precision is 0.8982018828392029
Model auc is 0.9121268391609192
