### Imports and load the data

In [36]:
from showupforhealth.utils import perform_train_test_split, scale_df
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
pd.set_option('display.max_columns', 40)

from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV

# DL model with UNBALANCED data

### Load and shuffle the data

In [2]:
data = pd.read_csv('/Users/alessio/code/janduplessis883/data-showup/data/output-data/full_train_data.csv')

In [3]:
data['Sex'].value_counts()

0    571356
2    330086
1        52
3        22
Name: Sex, dtype: int64

In [4]:
filt_sex_1 = (data['Sex'] != 1)
data = data[filt_sex_1]
data['Sex'].value_counts()

0    571356
2    330086
3        22
Name: Sex, dtype: int64

In [5]:
filt_sex_2 = (data['Sex'] != 3)
data = data[filt_sex_2]
data['Sex'].value_counts()

0    571356
2    330086
Name: Sex, dtype: int64

In [23]:
data_dna = data[data['Appointment_status']==0][:2000]

In [24]:
data_no_dna = data[data['Appointment_status']==1][:2000]

In [25]:
data_balanced = pd.concat([data_dna, data_no_dna]).sample(frac = 1)
data_balanced['Appointment_status'].value_counts()

0    2000
1    2000
Name: Appointment_status, dtype: int64

In [26]:
data_balanced.head(5)

Unnamed: 0,Appointment_status,temp,precipitation,Age,Sex,FRAILTY,DEPRESSION,OBESITY,IHD,DM,HPT,NDHG,SMI,IMD2023,dist_to_station,distance_to_surg,book_to_app_days,booked_by_clinician,registered_for_months,sin_week,cos_week,sin_Appointment_time,cos_Appointment_time,sin_month,cos_month,sin_day_of_week,cos_day_of_week,No_shows,Rota_ARRS,Rota_GP,Rota_HCA,Rota_Nurse,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Mixed,Ethnicity_Other,Ethnicity_White
11322,0,4.4,0.0,34,2,0.0,0,0,0,0,0,0,0,4330.0,0.109378,0.441702,6.0,0,2.0,-0.354605,0.935016,0.5,-0.8660254,-2.449294e-16,1.0,0.781831,0.62349,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
379,1,18.4,0.0,68,0,0.08,0,0,0,0,0,0,0,13786.0,0.213182,1.790413,0.0,0,5.0,-0.120537,-0.992709,-0.8660254,0.5,-0.5,-0.866025,0.781831,0.62349,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
39991,0,12.2,0.0,50,0,0.19,1,0,0,0,0,0,0,14414.0,0.332448,0.171967,3.0,1,14.0,0.748511,0.663123,1.224647e-16,-1.0,0.8660254,0.5,-0.433884,-0.900969,7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
33256,0,19.1,0.0,74,2,0.11,0,0,0,0,1,0,0,23375.0,0.586176,16.859957,0.0,0,58.0,0.748511,-0.663123,0.258819,-0.9659258,0.5,-0.866025,0.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
34492,0,14.6,0.0,49,0,0.14,0,0,0,0,1,0,0,16808.0,0.136631,0.204712,7.0,0,35.0,0.464723,-0.885456,-1.0,-1.83697e-16,0.5,-0.866025,0.781831,0.62349,28.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Split in X and y

In [27]:
X = data_balanced.drop(columns=['Appointment_status'])
y = data_balanced['Appointment_status']

### Baseline accuracy

In [28]:
print(f'The baseline accuracy if we assume that all patients will show-up is {y.value_counts()[1] / X.shape[0]}')

The baseline accuracy if we assume that all patients will show-up is 0.5


### Split in Train and Test

In [29]:
X_train, X_test, y_train, y_test = perform_train_test_split(X,y)

✅ OUTPUT: X_train, X_test, y_train, y_test
Train Set:  X_train, y_train - (3200, 36), (3200,)
 Test Set:  X_test, y_test - - (800, 36), (800,)


### Scale X_train and X_test

In [30]:
def mm_scaler(X_train, X_test):
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

X_train_sca, X_test_sca = mm_scaler(X_train, X_test)

In [31]:
X_train_scaled = pd.DataFrame(X_train_scal,columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scal,columns=X.columns)

# Models

### SVM - Support Vector Machine

In [18]:
from sklearn import svm
svm = svm.SVC()
grid = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 
        'C': [1000, 1500, 2000, 3000, 4000, 5000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'degree':[1,2,3,4,5]}

search_svm = GridSearchCV(svm, grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)

search_svm.fit(X_train_scaled, y_train)
search_svm_prediction = search_svm.predict(X_test_scaled) 

print(search_svm.best_params_)
print(classification_report(y_test, search_svm_prediction)) 
print(confusion_matrix(y_test, search_svm_prediction))

{'C': 4000, 'degree': 2, 'gamma': 0.01, 'kernel': 'poly'}
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       169
           1       0.95      0.98      0.96       151

    accuracy                           0.97       320
   macro avg       0.97      0.97      0.97       320
weighted avg       0.97      0.97      0.97       320

[[161   8]
 [  3 148]]


### SGDClassifier

In [19]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(max_iter=100000)
grid = {'loss': ('hinge', 'log_loss', 'modified_huber', 'perceptron', 'huber', 'epsilon_insensitive'), 
         'penalty': ('l2', 'l1', 'elasticnet', None),
         'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1]}

search_sgd = GridSearchCV(sgd, grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)

search_sgd.fit(X_train_scaled, y_train)
search_sgd_prediction = search_sgd.predict(X_test_scaled) 

print(search_sgd.best_params_) 
print(classification_report(y_test, search_sgd_prediction)) 
print(confusion_matrix(y_test, search_sgd_prediction))

{'alpha': 0.001, 'loss': 'hinge', 'penalty': 'l1'}
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       169
           1       0.94      0.99      0.97       151

    accuracy                           0.97       320
   macro avg       0.97      0.97      0.97       320
weighted avg       0.97      0.97      0.97       320

[[160   9]
 [  1 150]]


### Decision Tree Classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

grid = {'criterion':['gini', 'entropy'],
         'max_depth': range(1,10),
          'min_samples_split':range(1,10),
          'min_samples_leaf': range(1,5)}

tree_search = GridSearchCV(tree, grid, scoring ='roc_auc', cv = 5, n_jobs=-1)

tree_search.fit(X_train_scaled, y_train)
tree_search_prediction = tree_search.predict(X_test_scaled) 

print(tree_search.best_params_) 
print(classification_report(y_test, tree_search_prediction)) 
print(confusion_matrix(y_test, tree_search_prediction))

{'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 1}
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       169
           1       0.95      0.95      0.95       151

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

[[161   8]
 [  8 143]]


### XGBoost

In [22]:
import xgboost as xgb
estimator = xgb.XGBClassifier(objective= 'binary:logistic', nthread=4, seed=42)

grid = {'max_depth': range(2, 10, 1),
        'n_estimators': range(60, 220, 20),
        'learning_rate': [2, 1, 0.1, 0.01, 0.05]}

xgb_search = GridSearchCV(
    estimator=estimator,
    param_grid=grid,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = 5,
    verbose=True)
xgb_search.fit(X_train_scaled, y_train)

xgb_search_prediction = search_sgd.predict(X_test_scaled) 

print(xgb_search.best_params_) 
print(classification_report(y_test, xgb_search_prediction)) 
print(confusion_matrix(y_test, xgb_search_prediction))

Fitting 5 folds for each of 320 candidates, totalling 1600 fits
{'learning_rate': 1, 'max_depth': 6, 'n_estimators': 160}
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       169
           1       0.94      0.99      0.97       151

    accuracy                           0.97       320
   macro avg       0.97      0.97      0.97       320
weighted avg       0.97      0.97      0.97       320

[[160   9]
 [  1 150]]


In [None]:
# from sklearn.linear_model import LogisticRegression
# log_reg = LogisticRegression(max_iter=1000)
# score_log_reg = cross_val_score(log_reg, X_train_scaled, y_train, cv=5, scoring='recall').mean()
# score_log_reg

In [None]:
# from sklearn.naive_bayes import GaussianNB
# gaussian = GaussianNB()
# score_gaussian = cross_val_score(gaussian, X_train_scaled, y_train, cv=5, scoring='recall').mean()
# score_gaussian