In [31]:
import pandas as pd
import numpy as np

import sklearn.preprocessing as skpp

import datetime

from mimicpreprocess import DataHandler

In [2]:
# Create the project's data-access helper and call connect() on it.
# NOTE(review): presumably this opens the database connection used by the
# query methods in the following cells — confirm in mimicpreprocess.DataHandler.
dh = DataHandler()
dh.connect()

In [3]:
# Fetch the admissions and patients tables through the DataHandler helpers.
# Both results are used as pandas DataFrames that share a 'subject_id' key
# (they are joined on it in the next cell).
admissions = dh.admissions_query()
patients = dh.patient_query()

In [4]:
# Attach each patient's columns to their admission rows so that gender and age
# can be obtained per admission. The patients frame is re-indexed by subject_id
# because DataFrame.join matches the `on` column against the other frame's index.
patients_by_subject = patients.set_index('subject_id')
patient_info = admissions.join(patients_by_subject, on='subject_id')

In [5]:
# Compute an 'age' value for every admission by handing each row to DataHandler.age.
patient_info['age'] = patient_info.apply(dh.age, axis=1)

In [6]:
# Fetch the lab/vital measurements for the admissions in patient_info.
# The result is later joined back on 'hadm_id', so it is presumably indexed by
# hadm_id with one column per measurement — confirm in DataHandler.lab_event_query.
patient_vitals = dh.lab_event_query(patient_info)

In [7]:
# Count the missing values in each measurement column; the counts drive the
# choice of which columns are kept in the next cell.
patient_vitals.isnull().sum()

oxygen                   836
pco2                     412
PH                       355
po2                      412
tempurature              634
lipase                   436
hematocrit                 9
hemoglobin                10
INR                       68
lymphocytes               58
alkaline phosphatase     198
MCH                       11
amylase                  627
neutrophils               58
BUN                        8
platelet                   8
bicarbonate               14
CRP                      993
PTT                       68
PT                        72
RBCDW                     10
calcium                   23
ESR                     1120
creatinine                 8
WBC                     1172
glucose                    9
AST                      183
lactate                   86
dtype: int64

In [41]:
# Measurement columns retained for modelling. Every other column is discarded
# because too little data is available for it.
keep_vitals = [
    'bicarbonate', 'INR', 'MCH', 'AST', 'alkaline phosphatase', 'creatinine',
    'platelet', 'PT', 'PTT', 'lymphocytes', 'RBCDW', 'calcium', 'neutrophils',
    'glucose', 'hematocrit', 'hemoglobin', 'lactate', 'BUN',
]
patient_vitals_fixed = patient_vitals[keep_vitals]

In [42]:
# Collect the hadm_ids (the frame's index) of admissions that are missing six
# or more of the retained measurements.
missing_per_admission = patient_vitals_fixed.isnull().sum(axis=1)
drop_hadm_ids = patient_vitals_fixed[missing_per_admission >= 6].index

In [43]:
# Keep only the admissions that were not flagged for removal.
is_flagged = patient_vitals_fixed.index.isin(drop_hadm_ids)
patient_vitals_final = patient_vitals_fixed[~is_flagged]

In [44]:
# Impute the remaining gaps with per-column means. Working out WHY values are
# missing is left for the actual ML phase; mean imputation is a stop-gap here.
# Note: the means are taken over the unfiltered patient_vitals frame, so they
# also include the admissions that were filtered out of patient_vitals_final.
column_means = patient_vitals.mean()
patient_vitals_final = patient_vitals_final.fillna(column_means)

In [45]:
# Add the age column to the list of columns that will be standardized.
# The membership check keeps this cell idempotent: re-running it does not
# append a second 'age' entry to the shared list.
if 'age' not in keep_vitals:
    keep_vitals.append('age')

In [46]:
# Remove the flagged admissions from the patient/admission frame as well, so it
# stays in step with patient_vitals_final.
is_dropped_admission = patient_info['hadm_id'].isin(drop_hadm_ids)
patient_info = patient_info[~is_dropped_admission]

In [47]:
# Combine patient/admission details with the cleaned measurements; the
# measurements frame is matched through its index against the hadm_id column.
vital_patient_info = patient_info.join(patient_vitals_final, on='hadm_id')

# Standardize the measurement columns (plus age) in place.
# StandardScaler rescales each column to zero mean and unit variance.
stdsc = skpp.StandardScaler()
standardized_values = stdsc.fit_transform(vital_patient_info[keep_vitals])
vital_patient_info[keep_vitals] = standardized_values

In [48]:
# Target: the death_period label of each admission.
y_vitals = vital_patient_info['death_period']

# Features: everything except the label and the columns listed below.
# The axis is passed by keyword because the positional form drop(labels, 1)
# is deprecated and no longer accepted by pandas >= 2.0; axis=1 works on
# every pandas version.
non_feature_columns = ['death_period', 'subject_id', 'hadm_id', 'admittime', 'ethnicity', 'dob']
X_vitals = vital_patient_info.drop(non_feature_columns, axis=1)

# One-hot encode the remaining categorical columns (e.g. gender, insurance).
X_vitals = pd.get_dummies(X_vitals)

In [49]:
# Random-forest classifier plus the search space for the hyper-parameter search.
from sklearn.ensemble import RandomForestClassifier
try:
    # scikit-learn >= 0.18 exposes the search classes in model_selection;
    # sklearn.grid_search was deprecated in 0.18 and removed in 0.20.
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.grid_search import RandomizedSearchCV

# Candidate forest sizes 1..999; the randomized search samples from this list.
param_grid = {
              "n_estimators": list(range(1,1000)),
             }

# Fixed seed so repeated runs build the same forest.
forest = RandomForestClassifier(random_state=42)

In [50]:
# Randomized search over the forest size: 20 sampled settings, scored by
# accuracy, with a fixed seed for the sampling.
rand_for = RandomizedSearchCV(
    estimator=forest,
    param_distributions=param_grid,
    n_iter=20,
    scoring='accuracy',
    random_state=42
)
# Bind the return value to `_` so the fitted object's repr is not echoed.
_ = rand_for.fit(X_vitals, y_vitals)

In [51]:
# Report the best score, the parameters that achieved it, and the best estimator.
for result_attr in ('best_score_', 'best_params_', 'best_estimator_'):
    print(getattr(rand_for, result_attr))

0.829732065687
{'n_estimators': 373}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=373, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


In [40]:
# Rank the features by the importance the best forest assigned to them,
# highest first.
importances = rand_for.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement, and matches the other cells.
for rank, feature_idx in enumerate(indices):
    print("{}) {} {}".format(rank, X_vitals.columns[feature_idx], importances[feature_idx]))

0) lactate 0.10169740538
1) PH 0.0957207952629
2) BUN 0.0653723517137
3) bicarbonate 0.0610648516347
4) RBCDW 0.0524488090263
5) PT 0.0488486405863
6) calcium 0.0466598960835
7) platelet 0.0429403892511
8) AST 0.042490336142
9) hematocrit 0.041786466757
10) lymphocytes 0.0409871260711
11) creatinine 0.0397929653068
12) age 0.039282650099
13) alkaline phosphatase 0.039056730029
14) hemoglobin 0.0381365597273
15) glucose 0.0377600935078
16) neutrophils 0.037352046988
17) MCH 0.0363418698086
18) PTT 0.0361581634321
19) INR 0.0352000572538
20) gender_F 0.00545707548579
21) insurance_Private 0.00437659390715
22) gender_M 0.00408658802974
23) insurance_Medicare 0.00345513230779
24) insurance_Medicaid 0.00204048070053
25) insurance_Self Pay 0.000641049700574
26) insurance_Government 0.000617778428711
27) admission_type_URGENT 0.000153892308281
28) admission_type_EMERGENCY 7.32050702833e-05
