### Demo: scoring new data with the best model (Random Forest)

In [22]:
## Load the libraries used by this notebook
import numpy as np
import pandas as pd

import random
# Seeds only Python's built-in `random` module; NumPy's global generator is not seeded here.
random.seed(3)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.preprocessing import StandardScaler

# NOTE: confusion_matrix is imported again a few lines below; the repeat is harmless.
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_absolute_error, mean_squared_error

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

## Connect to the shared drive
# Colab-only: mounts Google Drive at /content/drive so the /content/drive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# References:
# general tuning: https://neptune.ai/blog/hyperparameter-tuning-in-python-complete-guide#:~:text=Some%20of%20the%20best%20hyperparameter,Hyperopt

# LR - https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
# SVM - https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/
# RF - https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv/notebook
# Decision Tree - https://www.kaggle.com/code/gauravduttakiit/hyperparameter-tuning-in-decision-trees
# XGBoost - https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
# Naive Bayes - https://medium.com/analytics-vidhya/how-to-improve-naive-bayes-9fa698e14cba
# KNN - https://medium.datadriveninvestor.com/k-nearest-neighbors-in-python-hyperparameters-tuning-716734bc557f

## Model evaluation 
def evaluate_model(y_true,y_pred):
  """Plot the confusion matrix and print headline classification metrics.

  Args:
    y_true: Ground-truth class labels.
    y_pred: Predicted hard class labels (not probabilities), aligned with y_true.

  Returns:
    None. Shows a confusion-matrix heatmap and prints accuracy, AUC-ROC,
    precision, F1 score and recall. Precision, F1 and recall are computed
    with scikit-learn's default arguments.
  """
  cm = confusion_matrix(y_true, y_pred)
  # Draw on an explicit axes so the figure can be labelled and stands on its own.
  fig, ax = plt.subplots()
  sns.heatmap(cm, annot=True, fmt='d', ax=ax)
  # confusion_matrix puts true labels on rows and predicted labels on columns.
  ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix")
  plt.show()
  accuracy = accuracy_score(y_true, y_pred)
  print("Accuracy:", accuracy)
  # AUC-ROC is undefined when y_true holds a single class (easy to hit on a
  # tiny demo set), so report NaN in that case instead of calling roc_auc_score.
  # Note: y_pred holds hard labels, so this AUC reflects a single operating
  # point rather than a ranking over predicted probabilities.
  if len(np.unique(y_true)) < 2:
    auc = float("nan")
  else:
    auc = roc_auc_score(y_true, y_pred)
  print("AUC-ROC:", auc)
  precision = precision_score(y_true, y_pred)
  print("Precision:", precision)
  f1 = f1_score(y_true, y_pred)
  print("F1 Score:", f1)
  recall = recall_score(y_true, y_pred)
  print("Recall:", recall)

import pickle # used below to load the saved (pickled) model
# Shared-drive folder that holds the pickled models; the model file name is appended to it below.
path = '/content/drive/Shareddrives/DATA245 Group#7/Final Project/Scripts/Saved Models/'

In [24]:
# Load the best model (Random Forest).
# The grid search showed that the default RF hyperparameters were already the
# best set, so no tuning was applied; the baseline RF model is used for the demo.
# NOTE: pickle.load can execute arbitrary code -- only load model files from a trusted source.
with open(path + 'RF_initial.pkl', 'rb') as model_file:  # closes the file handle once the model is loaded
  best_model = pickle.load(model_file)

# The new (demo) data plays the role of the testing data
newdata = pd.read_csv('/content/drive/Shareddrives/DATA245 Submission Materials/Python Scripts/demo/DEMO_data.csv')

In [25]:
# Display the demo rows (features plus the SepsisLabel column) before scoring them.
newdata

Unnamed: 0.1,Unnamed: 0,Hour,HR,O2Sat,Temp,MAP,Resp,BUN,Chloride,Creatinine,...,Hct,Hgb,WBC,Platelets,Age,HospAdmTime,ICULOS,SepsisLabel,0,1
0,414084,71,1.297615,0.905284,2.026143,-0.549834,0.842587,1.614454,2.324858,-0.45306,...,-1.429417,-1.494621,-1.927084,-4.573623,80.62,-0.04,72,1,0,1
1,994658,15,1.010221,-0.36894,-0.500567,0.471191,0.842587,-1.001891,0.678506,-0.149765,...,0.986449,0.848497,-1.257026,0.776099,21.0,-0.05,16,0,0,1
2,1536811,20,0.435432,-0.36894,1.156292,0.173047,1.230268,-0.594849,0.129722,-0.413006,...,-1.429417,-1.494621,0.283477,0.061321,52.0,-86.83,28,0,0,1
3,1552053,53,-0.714146,-0.687496,0.189791,-0.620803,-0.126614,2.85329,-1.333702,1.523919,...,0.116027,-0.272124,-1.407347,-0.887059,62.0,-3.19,54,0,1,0


In [26]:
# Ground-truth labels for the demo rows, kept for comparison with the predictions below.
print(newdata["SepsisLabel"])

0    1
1    0
2    0
3    0
Name: SepsisLabel, dtype: int64


In [27]:
# Strip the ground-truth label so only the feature columns are passed to the model.
# NOTE(review): the 'Unnamed: 0.1' and 'Unnamed: 0' columns stay in the feature frame;
# they look like row indices written out by an earlier CSV export -- confirm the
# model was trained with them.
justdata = newdata.drop('SepsisLabel', axis='columns')
justdata

Unnamed: 0.1,Unnamed: 0,Hour,HR,O2Sat,Temp,MAP,Resp,BUN,Chloride,Creatinine,Glucose,Hct,Hgb,WBC,Platelets,Age,HospAdmTime,ICULOS,0,1
0,414084,71,1.297615,0.905284,2.026143,-0.549834,0.842587,1.614454,2.324858,-0.45306,-0.544695,-1.429417,-1.494621,-1.927084,-4.573623,80.62,-0.04,72,0,1
1,994658,15,1.010221,-0.36894,-0.500567,0.471191,0.842587,-1.001891,0.678506,-0.149765,-0.802622,0.986449,0.848497,-1.257026,0.776099,21.0,-0.05,16,0,1
2,1536811,20,0.435432,-0.36894,1.156292,0.173047,1.230268,-0.594849,0.129722,-0.413006,-0.513832,-1.429417,-1.494621,0.283477,0.061321,52.0,-86.83,28,0,1
3,1552053,53,-0.714146,-0.687496,0.189791,-0.620803,-0.126614,2.85329,-1.333702,1.523919,0.574515,0.116027,-0.272124,-1.407347,-0.887059,62.0,-3.19,54,1,0


In [28]:
# Predicted label per demo row:
#   1 = sepsis detected 6 hours in advance, 0 = sepsis not detected 6 hours in advance
best_model.predict(justdata)

array([1, 0, 0, 0])