In [7]:
import time

# visualizatoin 
import matplotlib.pyplot as plt 

# data wrangling
import pandas as pd
import numpy as np 

# data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


# learning
from sklearn.linear_model import LogisticRegression

In [8]:
test_size = 0.2 # proportion for train versus test+val split
val_size = 0.5 # proportion for test versus val split
random_state = 42  # random state is used to set a seed for randomness, which is only relevant for reproducibility purposes
max_missing = 0.8  # maximum percentage of missing values for a column to be dropped

In [9]:
df = pd.read_csv('training_v2.csv')

In [10]:
start_time = time.time()

# save features
X = df.copy().drop(['hospital_death', 'patient_id', 'encounter_id', 'hospital_id', 'icu_id', # drop identifiers
                    'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob', # drop APACHE scores
                    'apache_2_bodysystem'], # drop because of similarity with apache_3j_bodysystem
                   axis=1)
# save target variable
y = df['hospital_death'].copy()
# save APACHE scores for later evaluation on train / test / validation data
y_apache = df['apache_4a_hospital_death_prob'].copy()

""" SPLIT DATA SET """
# split the dataset into train and test+validation set
(
    X_train,
    X_test,
    y_train,
    y_test,
    y_apache_train,
    y_apache_test,
    ) = train_test_split(X, y, y_apache, 
                         test_size=test_size, # used for testing and validation
                         random_state=random_state # for reproducibility
                        ) 
# split the test set into test + validation set
(
    X_val,
    X_test,
    y_val,
    y_test,
    y_apache_val,
    y_apache_test,
    ) = train_test_split(X_test, y_test, y_apache_test, 
                         test_size=val_size, # used for testing and validation
                         random_state=random_state # for reproducibility
                        ) 

"""MISSING VALUES"""
# drop columns with many missing values
missing = X_train.isna().sum() > max_missing * len(X_train)
missing = missing[missing].index
X_train = X_train.drop(missing, axis=1)
X_val = X_val.drop(missing, axis=1)
X_test = X_test.drop(missing, axis=1)

"""FURTHER PROCESSING PIPELINE"""
# define pre-processing steps for numerical features
num_transformer = Pipeline(steps=[("constant", VarianceThreshold()), # remove constant features
                                  ("imputer", SimpleImputer(strategy="mean")),
                                  ("scaler", StandardScaler())
                                 ])
# define preprocessing steps for categorical features
cat_transformer = Pipeline(steps=[("encoder", OneHotEncoder(drop='first', handle_unknown="ignore"))])
# create preprocessing pipeline
prep_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_transformer, make_column_selector(dtype_exclude=object)), # apply to columns NOT of type object (int or float)
        ('cat', cat_transformer, make_column_selector(dtype_include=object)) # apply to columns of type object
    ])
# pipeline
prep_pipeline.fit(X_train, y_train)
# display(prep_pipeline) # display preprocessing pipeline


        
"""PRINT STATS"""
print("Time: %.2fs" % (time.time() - start_time))
print("Train set: %s rows, %s columns" % X_train.shape)
print("Validation set: %s rows, %s columns" % X_val.shape)
print("Test set: %s rows, %s columns" % X_test.shape)

Time: 1.07s
Train set: 73370 rows, 144 columns
Validation set: 9171 rows, 144 columns
Test set: 9172 rows, 144 columns


In [11]:
X_train

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,...,d1_pao2fio2ratio_min,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem
4609,86.0,28.168975,0,Caucasian,M,170.2,Direct Admit,Accident & Emergency,admit,Neuro ICU,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neurological
75674,72.0,36.635088,0,Caucasian,M,170.1,,Accident & Emergency,admit,Med-Surg ICU,...,427.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Gastrointestinal
84022,36.0,27.459684,0,African American,M,162.6,Emergency Department,Accident & Emergency,admit,MICU,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic
38035,60.0,21.977351,0,Caucasian,M,181.0,Emergency Department,Accident & Emergency,admit,MICU,...,245.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Genitourinary
24371,27.0,19.960244,0,Caucasian,F,177.8,Emergency Department,Accident & Emergency,admit,Med-Surg ICU,...,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Sepsis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,16.0,22.378743,1,Hispanic,M,165.1,Operating Room,Operating Room / Recovery,admit,Neuro ICU,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neurological
54886,70.0,29.475309,1,Caucasian,M,180.0,Recovery Room,Operating Room / Recovery,admit,Neuro ICU,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
76820,72.0,30.827304,1,,M,187.0,Emergency Department,Operating Room / Recovery,admit,Med-Surg ICU,...,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular
860,46.0,61.339079,0,African American,M,167.6,Emergency Department,Accident & Emergency,admit,MICU,...,46.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory


In [12]:
y_train

4609     0
75674    1
84022    0
38035    0
24371    0
        ..
6265     0
54886    0
76820    0
860      0
15795    0
Name: hospital_death, Length: 73370, dtype: int64

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.tree import export_graphviz
import collections


In [14]:
rf = RandomForestClassifier()

In [15]:
rf_train = X_train.drop(['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'hospital_admit_source',
                        'apache_3j_bodysystem'], axis=1)

In [16]:
rf_train.fillna(rf_train.mean(), inplace=True)

In [17]:
rf_train

Unnamed: 0,age,bmi,elective_surgery,height,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,...,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
4609,86.000000,28.168975,0,170.2,0.000000,0,81.6,2.90288,301.00000,408.020000,...,286.173358,223.509507,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
75674,72.000000,36.635088,0,170.1,0.088889,0,106.0,2.90288,124.00000,305.010000,...,427.000000,427.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
84022,36.000000,27.459684,0,162.6,0.044444,0,72.6,2.90288,122.00000,703.030000,...,286.173358,223.509507,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
38035,60.000000,21.977351,0,181.0,0.213194,0,72.0,2.90288,305.00000,901.030000,...,386.666667,245.600000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
24371,27.000000,19.960244,0,177.8,0.052083,0,63.1,2.50000,113.00000,501.040000,...,286.173358,223.509507,0.000000,0.000000,0.00000,0.000000,0.000000,1.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,16.000000,22.378743,1,165.1,0.004167,0,61.0,2.90288,219.00000,1504.040000,...,286.173358,223.509507,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
54886,70.000000,29.475309,1,180.0,0.560417,0,95.5,2.90288,185.43556,0.250000,...,286.173358,223.509507,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
76820,72.000000,30.827304,1,187.0,0.236806,0,107.8,2.90288,202.00000,1205.010000,...,286.173358,223.509507,0.000000,0.000000,1.00000,0.000000,0.000000,0.000000,0.000000,0.000000
860,46.000000,61.339079,0,167.6,0.000000,0,172.3,3.40000,102.00000,206.010000,...,138.333333,46.000000,0.000000,0.000000,1.00000,0.000000,0.000000,0.000000,0.000000,0.000000


In [18]:
rf.fit(rf_train, y_train)

In [19]:
rf_test = X_test.drop(['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'hospital_admit_source',
                        'apache_3j_bodysystem'], axis=1)
rf_test.fillna(rf_test.mean(), inplace=True)

In [20]:
y_pred = rf.predict(rf_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9305494984736153


In [22]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
counter = collections.Counter(y_pred)
counter

Counter({0: 8910, 1: 262})

In [24]:
CM = confusion_matrix(y_test, y_pred)

TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]


In [25]:
FN

583

In [31]:
from sklearn.metrics import roc_curve, roc_auc_score


y_pred_probabilities = rf.predict_proba(rf_test)[:, 1]
roc_auc_score(y_test, y_pred_probabilities)

0.878064072745363