In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, precision_recall_curve, f1_score
from sklearn.linear_model import LogisticRegression as logit 
from sklearn.feature_selection import SequentialFeatureSelector as SFS, SelectFromModel
from sklearn.tree import DecisionTreeClassifier as DTC, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier as XGBC

In [2]:
# Static tables (the s_* frames), read in train / test / holdout order.
s_train, s_test, s_holdout = map(pd.read_csv, (
    'static_train_data_median.csv',
    'static_test_data_median.csv',
    'static_holdout_data_median.csv',
))

# Dynamic tables (the d_* frames), read in the same split order.
d_train, d_test, d_holdout = map(pd.read_csv, (
    'd_trainset_m2.csv',
    'd_testset_m2.csv',
    'd_holdoutset_m2.csv',
))

In [3]:
#convert datatypes
# Every timestamp column is parsed with the same text layout.
# NOTE(review): '%y' is a two-digit year, so the century is inferred by the
# parser -- confirm this matches how the CSVs were written.
timestamp_format = '%m/%d/%y %H:%M'

# Static tables: four timestamp columns become datetimes, ids become strings.
for static_df in (s_train, s_test, s_holdout):
    for time_col in ('hosp_admittime', 'hosp_dischtime', 'icu_intime', 'icu_outtime'):
        static_df[time_col] = pd.to_datetime(static_df[time_col], format=timestamp_format)
    static_df['id'] = static_df['id'].astype(str)

# Dynamic tables: a single timestamp column plus the same string id.
for dynamic_df in (d_train, d_test, d_holdout):
    dynamic_df['charttime'] = pd.to_datetime(dynamic_df['charttime'], format=timestamp_format)
    dynamic_df['id'] = dynamic_df['id'].astype(str)

In [4]:

#drop race
# Remove the 'race' column from each static split; all other columns are kept.
s_train = s_train.drop(columns=['race'])
s_test = s_test.drop(columns=['race'])
s_holdout = s_holdout.drop(columns=['race'])


In [5]:
# Print (rows, columns) for the three static tables, then the three dynamic ones.
for frame in (s_train, s_test, s_holdout, d_train, d_test, d_holdout):
    print(frame.shape)

(14289, 46)
(4083, 46)
(2042, 46)
(64985, 23)
(18781, 23)
(9838, 23)


In [6]:
# Show the dtype and non-null count of every column in the static test table.
s_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4083 entries, 0 to 4082
Data columns (total 46 columns):
 #   Column                                                           Non-Null Count  Dtype         
---  ------                                                           --------------  -----         
 0   id                                                               4083 non-null   object        
 1   hosp_admittime                                                   4083 non-null   datetime64[ns]
 2   hosp_dischtime                                                   4083 non-null   datetime64[ns]
 3   icu_intime                                                       4083 non-null   datetime64[ns]
 4   icu_outtime                                                      4083 non-null   datetime64[ns]
 5   los_icu                                                          4083 non-null   float64       
 6   icu_death                                                        4083 non-null  

# Train data

In [7]:
# Order the static rows by patient id. Ids were cast to str earlier, so the
# ordering is lexicographic rather than numeric.
s_train = s_train.sort_values(by='id')

# Order the dynamic rows by patient id, then chronologically within a patient.
d_train = d_train.sort_values(by=['id', 'charttime'])

In [8]:
# Preview the first rows of the sorted static training table.
s_train.head()

Unnamed: 0,id,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,icu_outcome,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU)
12434,20001305,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,0,84.22776,44.0,155.0,12,0,0,0,0,0,1,0,1,0,0,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
8833,20001361,2043-05-04 14:55:00,2043-05-18 16:58:00,2043-05-04 16:52:00,2043-05-10 17:59:00,6.05,0,1,30.338465,102.0,183.0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
10615,20002506,2032-03-19 05:42:00,2032-03-28 16:09:00,2032-03-19 05:50:00,2032-03-25 19:23:00,6.56,0,1,24.214207,60.0,170.0,3,0,0,0,0,0,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
7019,20003425,2055-07-21 10:00:00,2055-07-29 14:40:00,2055-07-22 17:13:00,2055-07-26 17:11:00,4.0,0,1,76.551461,72.7,170.0,12,0,1,0,0,0,1,0,1,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
8834,20003491,1997-12-18 04:50:00,1997-12-28 17:29:00,1997-12-18 06:10:00,1997-12-20 19:02:00,2.54,0,1,56.963058,60.0,170.0,6,0,0,1,1,0,0,1,1,1,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [9]:
# Calendar date (time-of-day dropped) of each measurement. 'charttime' was
# already parsed to datetime in the datatype-conversion cell, so the .dt
# accessor can be used on it directly.
d_train['chartdate'] = d_train['charttime'].dt.date


In [10]:
# Spot check: the chart dates recorded for one example patient.
is_example_patient = d_train['id'] == '20001305'
d_train.loc[is_example_patient, 'chartdate']

16213    1978-03-25
36370    1978-03-25
8267     1978-03-25
Name: chartdate, dtype: object

In [11]:
# Build one row per patient in the training split: the patient's static columns
# followed by the mean of every numeric dynamic column over the measurements
# taken on or after the calendar date of ICU admission.
#
# Notes on the filter, as written:
#   * it compares calendar dates, so measurements from earlier on the admission
#     day (before the icu_intime clock time) are included;
#   * there is no upper bound, so measurements after icu_outtime are included.
groupID = d_train.groupby('id')

# Collect the one-row frames in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every
# accumulated row on each iteration (quadratic time) and starts from an empty
# frame, which pandas handles specially when inferring dtypes.
patient_rows = []

for patient, group in groupID:
    # Look the static row up once and reuse it for both the admission time and
    # the merge, instead of scanning s_train twice per patient.
    static_row = s_train.loc[s_train['id'].values == patient]

    # .item() raises ValueError unless exactly one static row matches, which
    # surfaces missing or duplicated patient ids instead of hiding them.
    icu_in = static_row['icu_intime'].item()
    d = icu_in.date()

    # Keep measurements from the ICU admission date onward, then average them.
    # A patient with no qualifying rows yields all-NaN means.
    icu_results = group.loc[group['chartdate'] >= d]
    icu_results = icu_results.mean(numeric_only=True)

    # Place the static columns and the averaged dynamic columns side by side;
    # both pieces are re-indexed to 0 so they align on a single row.
    merge_ID = pd.concat(
        [static_row.reset_index(drop=True), icu_results.to_frame().T.reset_index(drop=True)],
        axis=1,
    )
    patient_rows.append(merge_ID)

# As before, every row keeps index label 0 (the per-patient frames are stacked
# without re-indexing). With no patients, fall back to an empty frame.
merged = pd.concat(patient_rows) if patient_rows else pd.DataFrame()
merged.head()

Unnamed: 0,id,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,icu_outcome,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,hemoglobin,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt
0,20001305,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,0,84.22776,44.0,155.0,12,0,0,0,0,0,1,0,1,0,0,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,13.666667,24.0,48.333333,11.0,107.666667,0.866667,144.666667,141.0,4.366667,30.0,9.8,30.1,32.7,91.0,182.0,3.3,15.6,9.9,1.333333,14.333333,32.766667
0,20001361,2043-05-04 14:55:00,2043-05-18 16:58:00,2043-05-04 16:52:00,2043-05-10 17:59:00,6.05,0,1,30.338465,102.0,183.0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,15.4,22.2,32.6,7.26,107.4,2.58,128.0,140.4,4.66,33.12,11.14,30.32,33.56,90.0,209.6,3.684,14.22,15.26,1.6,17.38,31.24
0,20002506,2032-03-19 05:42:00,2032-03-28 16:09:00,2032-03-19 05:50:00,2032-03-25 19:23:00,6.56,0,1,24.214207,60.0,170.0,3,0,0,0,0,0,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,15.0,22.333333,16.333333,8.833333,103.666667,0.9,106.666667,142.333333,4.1,35.933333,12.066667,30.1,33.5,89.333333,193.333333,4.02,13.133333,9.633333,1.266667,13.6,31.033333
0,20003425,2055-07-21 10:00:00,2055-07-29 14:40:00,2055-07-22 17:13:00,2055-07-26 17:11:00,4.0,0,1,76.551461,72.7,170.0,12,0,1,0,0,0,1,0,1,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,12.0,21.0,23.0,8.55,100.0,0.85,148.0,133.0,5.0,24.45,7.6,27.9,31.05,90.0,214.0,2.72,15.65,10.1,1.4,15.0,33.5
0,20003491,1997-12-18 04:50:00,1997-12-28 17:29:00,1997-12-18 06:10:00,1997-12-20 19:02:00,2.54,0,1,56.963058,60.0,170.0,6,0,0,1,1,0,0,1,1,1,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,16.0,19.333333,44.333333,7.833333,98.333333,1.9,140.0,133.666667,5.5,32.033333,10.466667,29.966667,32.7,91.0,169.666667,3.523333,15.7,8.6,1.533333,16.5,36.5


In [12]:
# (rows, columns) of the merged training table before rows with NaN are dropped.
merged.shape

(14289, 67)

In [13]:
# Discard every row that has a missing value in any column, then report the
# remaining (rows, columns).
merged = merged.dropna(axis=0, how='any')
merged.shape

(14283, 67)

In [28]:
# Preview the merged training table after rows with missing values were removed.
merged.head()

Unnamed: 0,id,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,icu_outcome,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,hemoglobin,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt
0,20001305,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,0,84.22776,44.0,155.0,12,0,0,0,0,0,1,0,1,0,0,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,13.666667,24.0,48.333333,11.0,107.666667,0.866667,144.666667,141.0,4.366667,30.0,9.8,30.1,32.7,91.0,182.0,3.3,15.6,9.9,1.333333,14.333333,32.766667
0,20001361,2043-05-04 14:55:00,2043-05-18 16:58:00,2043-05-04 16:52:00,2043-05-10 17:59:00,6.05,0,1,30.338465,102.0,183.0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,15.4,22.2,32.6,7.26,107.4,2.58,128.0,140.4,4.66,33.12,11.14,30.32,33.56,90.0,209.6,3.684,14.22,15.26,1.6,17.38,31.24
0,20002506,2032-03-19 05:42:00,2032-03-28 16:09:00,2032-03-19 05:50:00,2032-03-25 19:23:00,6.56,0,1,24.214207,60.0,170.0,3,0,0,0,0,0,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,15.0,22.333333,16.333333,8.833333,103.666667,0.9,106.666667,142.333333,4.1,35.933333,12.066667,30.1,33.5,89.333333,193.333333,4.02,13.133333,9.633333,1.266667,13.6,31.033333
0,20003425,2055-07-21 10:00:00,2055-07-29 14:40:00,2055-07-22 17:13:00,2055-07-26 17:11:00,4.0,0,1,76.551461,72.7,170.0,12,0,1,0,0,0,1,0,1,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,12.0,21.0,23.0,8.55,100.0,0.85,148.0,133.0,5.0,24.45,7.6,27.9,31.05,90.0,214.0,2.72,15.65,10.1,1.4,15.0,33.5
0,20003491,1997-12-18 04:50:00,1997-12-28 17:29:00,1997-12-18 06:10:00,1997-12-20 19:02:00,2.54,0,1,56.963058,60.0,170.0,6,0,0,1,1,0,0,1,1,1,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,16.0,19.333333,44.333333,7.833333,98.333333,1.9,140.0,133.666667,5.5,32.033333,10.466667,29.966667,32.7,91.0,169.666667,3.523333,15.7,8.6,1.533333,16.5,36.5


In [14]:
# merged.to_csv('Mmerged.csv', index=False)

# Test data

In [15]:
# Order the static test rows by patient id, and the dynamic rows by patient id
# then measurement time.
s_test.sort_values(by=['id'], inplace=True)

d_test.sort_values(by=['id', 'charttime'], inplace=True)

# Calendar date (time-of-day dropped) of each measurement.
d_test['chartdate'] = pd.to_datetime(d_test['charttime']).dt.date

# Build one row per patient in the test split: the patient's static columns
# followed by the mean of every numeric dynamic column over the measurements
# taken on or after the calendar date of ICU admission. The filter compares
# calendar dates and has no upper bound at icu_outtime.
grouptID = d_test.groupby('id')

# Collect the one-row frames in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every
# accumulated row on each iteration (quadratic time).
patient_rows = []

for patient, group in grouptID:
    # Look the static row up once and reuse it for both the admission time and
    # the merge, instead of scanning s_test twice per patient.
    static_row = s_test.loc[s_test['id'].values == patient]

    # .item() raises ValueError unless exactly one static row matches.
    icu_in = static_row['icu_intime'].item()
    d = icu_in.date()

    # Keep measurements from the ICU admission date onward, then average them.
    # A patient with no qualifying rows yields all-NaN means.
    icu_results = group.loc[group['chartdate'] >= d]
    icu_results = icu_results.mean(numeric_only=True)

    # Static columns and averaged dynamic columns side by side on one row.
    merge_ID = pd.concat(
        [static_row.reset_index(drop=True), icu_results.to_frame().T.reset_index(drop=True)],
        axis=1,
    )
    patient_rows.append(merge_ID)

# As before, every row keeps index label 0. With no patients, fall back to an
# empty frame.
mergedT = pd.concat(patient_rows) if patient_rows else pd.DataFrame()


In [16]:
# (rows, columns) of the merged test table before rows with NaN are dropped.
mergedT.shape

(4083, 67)

In [17]:
# Discard every row that has a missing value in any column, then report the
# remaining (rows, columns).
mergedT = mergedT.dropna(axis=0, how='any')
mergedT.shape

(4080, 67)

In [18]:
# mergedT.to_csv('MmergedTest.csv', index=False)

In [19]:
# Stack the merged training rows on top of the merged test rows. Index labels
# are carried over unchanged from the two inputs.
merged_icu_mean = pd.concat((merged, mergedT), axis=0)

In [20]:
# (rows, columns) of the combined train + test table.
merged_icu_mean.shape

(18363, 67)

In [21]:
# merged_icu_mean.to_csv('Mmerged_icu_mean.csv', index=False)

# Holdout data

In [22]:
# Order the static holdout rows by patient id, and the dynamic rows by patient
# id then measurement time.
s_holdout.sort_values(by=['id'], inplace=True)

d_holdout.sort_values(by=['id', 'charttime'], inplace=True)

# Calendar date (time-of-day dropped) of each measurement.
d_holdout['chartdate'] = pd.to_datetime(d_holdout['charttime']).dt.date

# Build one row per patient in the holdout split: the patient's static columns
# followed by the mean of every numeric dynamic column over the measurements
# taken on or after the calendar date of ICU admission. The filter compares
# calendar dates and has no upper bound at icu_outtime.
grouptID = d_holdout.groupby('id')

# Collect the one-row frames in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every
# accumulated row on each iteration (quadratic time).
patient_rows = []

for patient, group in grouptID:
    # Look the static row up once and reuse it for both the admission time and
    # the merge, instead of scanning s_holdout twice per patient.
    static_row = s_holdout.loc[s_holdout['id'].values == patient]

    # .item() raises ValueError unless exactly one static row matches.
    icu_in = static_row['icu_intime'].item()
    d = icu_in.date()

    # Keep measurements from the ICU admission date onward, then average them.
    # A patient with no qualifying rows yields all-NaN means.
    icu_results = group.loc[group['chartdate'] >= d]
    icu_results = icu_results.mean(numeric_only=True)

    # Static columns and averaged dynamic columns side by side on one row.
    merge_ID = pd.concat(
        [static_row.reset_index(drop=True), icu_results.to_frame().T.reset_index(drop=True)],
        axis=1,
    )
    patient_rows.append(merge_ID)

# As before, every row keeps index label 0. With no patients, fall back to an
# empty frame.
mergedH = pd.concat(patient_rows) if patient_rows else pd.DataFrame()


In [23]:
# Preview the first rows of the sorted static holdout table.
s_holdout.head()

Unnamed: 0,id,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,admission_age,weight_admit,height,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,cld,copd,diabetes,hypertension,ihd,stroke,icu_outcome,race_encode_African,race_encode_Asian,race_encode_Caucasian,race_encode_Hispanic,race_encode_Not Specified,race_encode_South American,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,admission_type_URGENT,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU)
781,20017191,2032-12-03 17:42:00,2032-12-08 15:46:00,2032-12-03 17:42:00,2032-12-06 18:02:00,3.01,0,0,84.924695,52.3,163.0,4,0,0,0,0,0,0,0,1,0,0,3,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
782,20036035,2022-10-05 19:51:00,2022-10-09 18:00:00,2022-10-05 21:56:00,2022-10-09 19:49:00,3.91,1,0,84.760666,60.0,152.0,7,0,0,0,0,0,0,1,1,0,0,7,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1532,20037205,2022-03-05 19:44:00,2022-03-13 14:28:00,2022-03-05 19:45:00,2022-03-08 22:18:00,3.11,0,0,47.17474,99.0,159.0,0,1,0,0,0,0,0,0,1,0,0,3,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1756,20039772,2065-01-19 00:40:00,2065-02-04 16:57:00,2065-01-19 04:35:00,2065-01-22 19:03:00,3.6,0,1,58.051521,76.8,185.0,6,1,0,1,1,0,0,1,1,1,1,3,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
507,20042202,2048-03-27 01:42:00,2048-04-10 13:21:00,2048-03-27 03:48:00,2048-03-31 22:28:00,4.78,0,1,63.235742,79.6,178.0,8,0,0,1,1,1,0,1,1,1,0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [24]:
# Preview the first rows of the sorted dynamic holdout table, including the
# derived 'chartdate' column.
d_holdout.head()

Unnamed: 0,id,charttime,aniongap,bicarbonate,bun,calcium,chloride,creatinine,glucose,sodium,potassium,hematocrit,hemoglobin,mch,mchc,mcv,platelet,rbc,rdw,wbc,inr,pt,ptt,chartdate
4022,20017191,2032-12-04 04:56:00,31.0,16.0,35.0,9.2,99.0,1.6,156.0,140.0,5.5,53.8,17.8,30.6,33.1,92.0,114.0,5.82,14.5,25.7,0.9,10.2,22.4,2032-12-04
915,20017191,2032-12-04 14:19:00,18.0,23.0,40.0,8.4,105.0,1.5,136.0,141.0,4.7,47.8,16.2,31.2,33.8,92.0,98.0,5.17,14.4,24.5,1.4,15.0,33.5,2032-12-04
5621,20036035,2022-10-05 23:22:00,29.0,16.0,19.0,9.1,102.0,0.8,486.0,143.0,3.5,41.6,13.7,29.8,32.8,91.0,229.0,4.58,13.4,17.8,1.4,15.6,37.7,2022-10-05
7973,20036035,2022-10-06 03:36:00,31.0,13.0,18.0,9.1,104.0,1.0,386.0,144.0,4.0,41.1,13.5,29.7,32.7,91.0,206.0,4.54,13.5,18.8,1.3,14.4,36.0,2022-10-06
9440,20037205,2022-03-05 21:55:00,17.0,21.0,11.0,8.7,104.0,0.6,129.0,138.0,4.2,36.6,11.8,27.9,32.2,87.0,410.0,4.23,13.2,18.7,1.4,14.8,27.0,2022-03-05


In [25]:
# (rows, columns) of the merged holdout table before rows with NaN are dropped.
mergedH.shape

(2042, 67)

In [26]:
# Discard every row that has a missing value in any column, then report the
# remaining (rows, columns).
mergedH = mergedH.dropna(axis=0, how='any')
mergedH.shape

(2041, 67)

In [27]:
# mergedH.to_csv('MmergedHoldout.csv', index=False)