In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import tensorflow_datasets as tfds


pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [783]:
df = pd.read_csv('training_v2.csv')

In [784]:
# composite function to clean dataset
def impute_missing_values(df):
    
    #replace missing ages with mean 
    df['age'].fillna(value=df['age'].describe()['mean'], inplace=True)
    
    #find mean height and weight by gender
    height_weight_by_gender = df.groupby('gender').mean()[['height', 'weight']]
    avg_height = height_weight_by_gender['height'].mean()
    avg_weight = height_weight_by_gender['weight'].mean()
    
    #fill in missing gender based on average height and weight
    #where F are < average and M are > average
    df.loc[(df['height']> avg_height) & (df['gender'].isna()), 'gender'] = 'M'
    df.loc[(df['height']< avg_height) & (df['gender'].isna()), 'gender'] = 'F'
    df.loc[(df['weight']> avg_weight) & (df['gender'].isna()), 'gender'] = 'M'
    df.loc[(df['weight']< avg_weight) & (df['gender'].isna()), 'gender'] = 'F'
    
    #any patients without height or weight information are defaulted to M
    df['gender'].fillna(value='M', inplace=True)
    
    #fill in average height and weight based on patient gender
    df.loc[(df['gender'] == 'F') & (df['weight'].isna()), 'weight'] = height_weight_by_gender.loc['F']['weight']
    df.loc[(df['gender'] == 'F') & (df['height'].isna()), 'height'] = height_weight_by_gender.loc['F']['height']
    df.loc[(df['gender'] == 'M') & (df['weight'].isna()), 'weight'] = height_weight_by_gender.loc['M']['weight']
    df.loc[(df['gender'] == 'M') & (df['height'].isna()), 'height'] = height_weight_by_gender.loc['M']['height']
    df['bmi'] = df['weight'] / (df['height']/100)**2
    

In [785]:
x = ['albumin_apache','bilirubin_apache','fio2_apache','paco2_for_ph_apache', 'paco2_apache','pao2_apache','ph_apache',
'urineoutput_apache','d1_diasbp_invasive_max','d1_diasbp_invasive_min','d1_mbp_invasive_max','d1_mbp_invasive_min',
 'd1_sysbp_invasive_max','d1_sysbp_invasive_min','h1_diasbp_invasive_max','h1_diasbp_noninvasive_min','h1_mbp_invasive_max',
 'h1_mbp_invasive_min','h1_mbp_noninvasive_max', 'h1_mbp_noninvasive_min', 'h1_sysbp_invasive_max','h1_sysbp_invasive_min',
 'h1_sysbp_noninvasive_max','h1_sysbp_noninvasive_min','d1_albumin_max','d1_albumin_min','d1_bilirubin_max','d1_bilirubin_min',
 'd1_inr_max','d1_inr_min','d1_lactate_max','d1_lactate_min','h1_albumin_max','h1_albumin_min','h1_bilirubin_max',
 'h1_bilirubin_min','h1_bun_max','h1_bun_min','h1_calcium_max','h1_calcium_min','h1_creatinine_max','h1_creatinine_min',
 'h1_glucose_max','h1_glucose_min','h1_hco3_max','h1_hco3_min','h1_hemaglobin_max','h1_hemaglobin_min','h1_hematocrit_max',
 'h1_hematocrit_min','h1_inr_max','h1_inr_min','h1_lactate_max','h1_lactate_min','h1_platelets_max','h1_platelets_min',
 'h1_potassium_max','h1_potassium_min','h1_sodium_max','h1_sodium_min' ,'h1_wbc_max','h1_wbc_min','d1_arterial_pco2_max',
 'd1_arterial_pco2_min' ,'d1_arterial_ph_max' ,'d1_arterial_ph_min','d1_arterial_po2_max','d1_arterial_po2_min',
 'd1_pao2fio2ratio_max','d1_pao2fio2ratio_min','h1_arterial_pco2_max','h1_arterial_pco2_min','h1_arterial_ph_max',
 'h1_arterial_ph_min','h1_arterial_po2_max','h1_arterial_po2_min','h1_pao2fio2ratio_max','h1_pao2fio2ratio_min','h1_diasbp_invasive_min' ]

In [786]:
df.drop(['icu_id','readmission_status','hospital_admit_source','encounter_id','patient_id','apache_3j_diagnosis'],axis=1, inplace=True)

In [787]:
df.drop(x, axis=1, inplace=True)

In [788]:
impute_missing_values(df)

In [789]:
# check for NAN values
round(df.isna().sum()/df.shape[0]*100, 2)

hospital_id                       0.00
hospital_death                    0.00
age                               0.00
bmi                               0.00
elective_surgery                  0.00
ethnicity                         1.52
gender                            0.00
height                            0.00
icu_admit_source                  0.12
icu_stay_type                     0.00
icu_type                          0.00
pre_icu_los_days                  0.00
weight                            0.00
apache_2_diagnosis                1.81
apache_post_operative             0.00
arf_apache                        0.78
bun_apache                       21.00
creatinine_apache                20.56
gcs_eyes_apache                   2.07
gcs_motor_apache                  2.07
gcs_unable_apache                 1.13
gcs_verbal_apache                 2.07
glucose_apache                   12.03
heart_rate_apache                 0.96
hematocrit_apache                21.67
intubated_apache         

In [790]:
df.isna().sum()

hospital_id                          0
hospital_death                       0
age                                  0
bmi                                  0
elective_surgery                     0
ethnicity                         1395
gender                               0
height                               0
icu_admit_source                   112
icu_stay_type                        0
icu_type                             0
pre_icu_los_days                     0
weight                               0
apache_2_diagnosis                1662
apache_post_operative                0
arf_apache                         715
bun_apache                       19262
creatinine_apache                18853
gcs_eyes_apache                   1901
gcs_motor_apache                  1901
gcs_unable_apache                 1037
gcs_verbal_apache                 1901
glucose_apache                   11036
heart_rate_apache                  878
hematocrit_apache                19878
intubated_apache         

In [791]:
#fill means
df.d1_heartrate_max.fillna(df.d1_heartrate_max.mean(),inplace=True)
df.d1_heartrate_min.fillna(df.d1_heartrate_min.mean(),inplace=True) 
df.d1_mbp_max.fillna(df.d1_mbp_max.mean(),inplace=True) 
df.d1_mbp_min.fillna(df.d1_mbp_min.mean(),inplace=True) 
df.d1_resprate_max.fillna(df.d1_resprate_max.mean(),inplace=True) 
df.d1_resprate_min.fillna(df.d1_resprate_min.mean(),inplace=True) 
df.d1_spo2_max.fillna(df.d1_spo2_max.mean(),inplace=True) 
df.d1_spo2_min.fillna(df.d1_spo2_min.mean(),inplace=True) 
df.d1_sysbp_max.fillna(df.d1_sysbp_max.mean(),inplace=True) 
df.d1_sysbp_min.fillna(df.d1_sysbp_min.mean(),inplace=True) 

In [792]:
#fill categoricals
df.ethnicity.fillna('Other/Unknown',inplace=True)
df.apache_3j_bodysystem.fillna('Other',inplace=True)
df.icu_admit_source.fillna('Other',inplace=True)

In [793]:
#fill numerical categories
df.gcs_eyes_apache.fillna(4.0, inplace=True)
df.gcs_motor_apache.fillna(6.0, inplace=True)
df.gcs_unable_apache.fillna(0.0, inplace=True)
df.intubated_apache.fillna(0.0, inplace=True)
df.arf_apache.fillna(0.0, inplace=True)
df.ventilated_apache.fillna(0.0, inplace=True)
df.aids.fillna(0.0, inplace=True)
df.cirrhosis.fillna(0.0, inplace=True)
df.diabetes_mellitus.fillna(0.0, inplace=True)
df.hepatic_failure.fillna(0.0, inplace=True)
df.immunosuppression.fillna(0.0, inplace=True)
df.leukemia.fillna(0.0, inplace=True)
df.lymphoma.fillna(0.0, inplace=True)
df.solid_tumor_with_metastasis.fillna(0.0, inplace=True)

In [794]:
# #drop due to multicollinearity or duplicate
df.drop(['d1_diasbp_max','d1_diasbp_noninvasive_max','d1_mbp_noninvasive_max','d1_diasbp_min','d1_diasbp_noninvasive_min','d1_mbp_noninvasive_min','gcs_verbal_apache'],axis=1,inplace=True)
df.drop(['bun_apache','creatinine_apache','hematocrit_apache','sodium_apache','wbc_apache','glucose_apache','d1_hemaglobin_max','d1_hemaglobin_min','heart_rate_apache','apache_2_bodysystem'],axis=1,inplace=True)
df.drop(['temp_apache','h1_temp_max','h1_temp_min','h1_diasbp_noninvasive_max'],axis=1,inplace=True)



In [795]:
#impute median for the following column
cols = ['d1_sysbp_noninvasive_max',
 'd1_sysbp_noninvasive_min',
 'd1_temp_max',
 'd1_temp_min',
 'h1_diasbp_max',
 'h1_diasbp_min',
 'h1_heartrate_max',
 'h1_heartrate_min',
 'h1_mbp_max',
 'h1_mbp_min',
 'h1_resprate_max',
 'h1_resprate_min',
 'h1_spo2_max',
 'h1_spo2_min',
 'h1_sysbp_max',
 'h1_sysbp_min',
 'd1_bun_max',
 'd1_bun_min',
 'd1_calcium_max',
 'd1_calcium_min',
 'd1_creatinine_max',
 'd1_creatinine_min',
 'd1_glucose_max',
 'd1_glucose_min',
 'd1_hco3_max',
 'd1_hco3_min',
 'd1_hematocrit_max',
 'd1_hematocrit_min',
 'd1_platelets_max',
 'd1_platelets_min',
 'd1_potassium_max',
 'd1_potassium_min',
 'd1_sodium_max',
 'd1_sodium_min',
 'd1_wbc_max',
 'd1_wbc_min',
 'map_apache',
 'resprate_apache',
 'apache_4a_hospital_death_prob',
 'apache_4a_icu_death_prob',
'apache_2_diagnosis']

In [796]:
for col in cols:
    df[col].fillna(df[col].median(),inplace=True)

# transform hospital id to percent of death

In [797]:
#create a dictionary to map all the hopsital ids to percentage of hospital deaths
x = df.groupby('hospital_id').sum()['hospital_death']/df.groupby('hospital_id').count()['hospital_death']
x = list(zip(x.index, list(x)))
dic = {i:'less than 10%' if j < 0.1 else '10-20%' if j < 0.2 else 'greater than 20%' for i,j in x}

df['hospital_death_rate'] = df.hospital_id.apply(lambda x: dic[x])

In [798]:
df.drop('hospital_id',inplace=True,axis=1)

# Look at correlated columns

In [None]:
x = df.corr()['hospital_death']
x = list(zip(list(x.index),list(x)))
above_5 = [i for i, j in x if abs(j) > 0.1]
below_5 = [i for i, j in x if abs(j) <= 0.1]

In [None]:
above_5

In [None]:
below_5

# One hot encode categorical

In [822]:
#floats that need to be converted to ints
float_2_int_cols = ['apache_2_diagnosis','arf_apache','gcs_eyes_apache','gcs_motor_apache','gcs_unable_apache',
                   'intubated_apache','ventilated_apache','aids','cirrhosis','diabetes_mellitus','hepatic_failure',
                   'immunosuppression','leukemia','lymphoma','solid_tumor_with_metastasis']
for col in float_2_int_cols:
    df[col] = df[col].astype(int)

In [823]:
categorical_cols = ['hospital_death_rate','ethnicity','gender','icu_admit_source','icu_stay_type','icu_type','gcs_eyes_apache',
                   'gcs_motor_apache','apache_3j_bodysystem']
# hospital_id = 147 unique
# ethnicity = 6 unique
# gender = 2 unique
# icu_admit_source = 5 unique
# icu_stay_type = 3 unique
# apache_2_diagnosis = 44 unique
# icu_type = 8 unique
# gcs_eyes_apache = 4 unique
# gcs_motor_apache = 6 unique
# apache_3j_bodysystem = 12 unique
for col in categorical_cols:
    df[col] = df[col].astype(str)

dummies = pd.get_dummies(df[categorical_cols],drop_first=True)

In [824]:
df_dum = pd.concat([df,dummies],axis=1)

In [825]:
df_dum.drop(categorical_cols,axis=1,inplace=True)

In [826]:
#df_dum.drop(below_5, axis=1, inplace=True)

In [827]:
from sklearn.model_selection import train_test_split

In [828]:
X = df_dum.drop('hospital_death',axis=1)
y = df_dum['hospital_death']

In [829]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2,stratify=y)

In [830]:
continuous_col = ['age',
 'bmi',
 'height',
 'pre_icu_los_days',
 'weight',
 'map_apache',
 'resprate_apache',
 'd1_heartrate_max',
 'd1_heartrate_min',
 'd1_mbp_max',
 'd1_mbp_min',
 'd1_resprate_max',
 'd1_resprate_min',
 'd1_spo2_max',
 'd1_spo2_min',
 'd1_sysbp_max',
 'd1_sysbp_min',
 'd1_sysbp_noninvasive_max',
 'd1_sysbp_noninvasive_min',
 'd1_temp_max',
 'd1_temp_min',
 'h1_diasbp_max',
 'h1_diasbp_min',
 'h1_heartrate_max',
 'h1_heartrate_min',
 'h1_mbp_max',
 'h1_mbp_min',
 'h1_resprate_max',
 'h1_resprate_min',
 'h1_spo2_max',
 'h1_spo2_min',
 'h1_sysbp_max',
 'h1_sysbp_min',
 'd1_bun_max',
 'd1_bun_min',
 'd1_calcium_max',
 'd1_calcium_min',
 'd1_creatinine_max',
 'd1_creatinine_min',
 'd1_glucose_max',
 'd1_glucose_min',
 'd1_hco3_max',
 'd1_hco3_min',
 'd1_hematocrit_max',
 'd1_hematocrit_min',
 'd1_platelets_max',
 'd1_platelets_min',
 'd1_potassium_max',
 'd1_potassium_min',
 'd1_sodium_max',
 'd1_sodium_min',
 'd1_wbc_max',
 'd1_wbc_min',
 'apache_4a_hospital_death_prob',
 'apache_4a_icu_death_prob']

In [833]:
#continuous_col = [x for x in continuous_col if x not in below_5]

In [834]:
ss = StandardScaler()

In [835]:
x_train.loc[:,continuous_col] = ss.fit_transform(x_train[continuous_col])
x_test.loc[:,continuous_col] = ss.transform(x_test[continuous_col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [836]:
#use smote to balance the class sizes
smt = SMOTE()
x_train, y_train = smt.fit_sample(x_train, y_train)

In [837]:
x_train = pd.DataFrame(x_train)
y_train = pd.Series(y_train)

In [838]:
GBC = GradientBoostingClassifier()

In [839]:
GBC.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [840]:
y_pred = GBC.predict(x_test)

In [841]:
accuracy_score(y_test, y_pred)

0.9156081338930382

In [842]:
f1_score(y_test, y_pred)

0.4471428571428571

In [843]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95     16760
           1       0.51      0.40      0.45      1583

    accuracy                           0.92     18343
   macro avg       0.73      0.68      0.70     18343
weighted avg       0.91      0.92      0.91     18343



In [844]:
roc_auc_score(y_test,y_pred)

0.6800945721018519

## RANDOM FOREST

In [845]:
rfc = RandomForestClassifier(max_depth=20, n_estimators=100, min_samples_leaf=2,min_samples_split=5)

In [846]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [847]:
y_pred = rfc.predict(x_test)

In [848]:
accuracy_score(y_test, y_pred)

0.9193152701302949

In [849]:
f1_score(y_test, y_pred)

0.45985401459854014

In [850]:
print(classification_report(y_test,y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.94      0.97      0.96     16760
           1       0.54      0.40      0.46      1583

    accuracy                           0.92     18343
   macro avg       0.74      0.68      0.71     18343
weighted avg       0.91      0.92      0.91     18343



array([[16233,   527],
       [  953,   630]])

In [851]:
roc_auc_score(y_test, y_pred)

0.6832673038564582

In [852]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train.values, y_train.values))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test.values,y_test.values))

In [853]:
BATCH_SIZE = 128
SHUFFLE_BUFFER_SIZE = 120000

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [864]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, kernel_regularizer=tf.keras.regularizers.l2(0.1), activation='relu'),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(64, kernel_regularizer=tf.keras.regularizers.l2(0.1), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    
    tf.keras.layers.Dense(32, kernel_regularizer=tf.keras.regularizers.l2(0.1), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    
    tf.keras.layers.Dense(1, activation='sigmoid')
    
])

In [865]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
             optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
             metrics=['acc'])

In [None]:
model.fit(train_dataset,validation_data=test_dataset,epochs=50)

Train for 1048 steps, validate for 144 steps
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50

In [857]:
y_pred = model.predict(test_dataset)

In [858]:

f1_score(y_test,np.round(y_pred))

0.4223031232036789

In [859]:
print(classification_report(y_test,np.round(y_pred)))

              precision    recall  f1-score   support

           0       0.97      0.85      0.90     16760
           1       0.30      0.70      0.42      1583

    accuracy                           0.84     18343
   macro avg       0.64      0.77      0.66     18343
weighted avg       0.91      0.84      0.86     18343



In [860]:
roc_auc_score(y_test,np.round(y_pred))

0.7724766198737479

In [763]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_66 (Dense)             multiple                  65280     
_________________________________________________________________
batch_normalization_55 (Batc multiple                  1024      
_________________________________________________________________
dropout_45 (Dropout)         multiple                  0         
_________________________________________________________________
dense_67 (Dense)             multiple                  32896     
_________________________________________________________________
batch_normalization_56 (Batc multiple                  512       
_________________________________________________________________
dropout_46 (Dropout)         multiple                  0         
_________________________________________________________________
dense_68 (Dense)             multiple                