# Settings, Directory Specs, and Imports

In [1]:

# Streamlit execution mode:
# 0 = no streamlit
# 1 = test user inputs
# 2 = run in streamlit
# NOTE(review): streamlit_status is not read by any cell visible in this notebook.
streamlit_status = 0
#file_name_pickle_read = 'model_2020_06_06_1105.pickle'
# Plot switch. NOTE(review): the plotting cell further down does not check this
# flag, so the figures are drawn regardless of its value -- confirm intent.
do_plots = 1

# Directory prefix for the input CSVs; the reads in the eICU wrangling cell
# are built as dir_read + '<file>.csv', so the trailing slash is required.
dir_read = '/Users/rachellehorwitz/Documents/ViTalErt/data/filtered05/'
#dir_read = '/Users/rachellehorwitz/Documents/VTAlert/over18_eicu/'

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, confusion_matrix, plot_confusion_matrix, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import streamlit as st
import pickle
from datetime import datetime
import pyprojroot
from pandas_profiling import ProfileReport

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Larger default font for every matplotlib figure in the notebook.
matplotlib.rcParams['font.size'] = 22





# eICU Data Wrangling

In [2]:
# Load the raw tables from dir_read
patient = pd.read_csv(dir_read + 'patient.csv')
admissiondx = pd.read_csv(dir_read + 'admissiondx.csv')
diagnosis = pd.read_csv(dir_read + 'diagnosis.csv')
ptusid_pos = pd.read_csv(dir_read + 'pos_ptunitstayid.csv')
# hr is indexed by stay id only while the other tables are being filtered
hr = pd.read_csv(dir_read + 'vitalPeriodic_heartrate_first24.csv').set_index('patientunitstayid')

n_patient_rows = patient.shape[0]
n_hr_rows = hr.shape[0]
print('There are ' + str(n_patient_rows) + ' patientunitstayids in "patient.csv" but only '
      + str(n_hr_rows) + ' patientunitstayids in the hr csv file')
print('Cleaning that up for patient, admissiondx, diagnosis, and ptusid_pos.')

# Keep only the rows whose stay id also appears in the heart-rate table
hr_stay_ids = list(hr.index)
patient = patient[patient['patientunitstayid'].isin(hr_stay_ids)]
admissiondx = admissiondx[admissiondx['patientunitstayid'].isin(hr_stay_ids)]
diagnosis = diagnosis[diagnosis['patientunitstayid'].isin(hr_stay_ids)]
ptusid_pos = ptusid_pos[ptusid_pos['patientunitstayid'].isin(hr_stay_ids)]

# Put patientunitstayid back as an ordinary column
hr = hr.reset_index()

There are 161543 patientunitstayids in "patient.csv" but only 158784 patientunitstayids in the hr csv file
Cleaning that up for patient, admissiondx, diagnosis, and ptusid_pos.


In [3]:


# Negative class = every stay in "patient" that is not listed in ptusid_pos
in_positive_class = patient['patientunitstayid'].isin(ptusid_pos['patientunitstayid'])
ptusid_neg = patient.loc[~in_positive_class, ['patientunitstayid']].assign(label=0)
ptusid_pos = ptusid_pos.assign(label=1)

# Attach the labels to "patient", joining on the stay id index
ptusid_w_label = pd.concat([ptusid_pos, ptusid_neg]).set_index('patientunitstayid')
patient = patient.set_index('patientunitstayid').join(ptusid_w_label)

# The join appends 'label' last; rotate it to the first position
cols = patient.columns.tolist()
patient = patient[cols[-1:] + cols[:-1]]

# BMI = weight / (height converted from cm to m) squared
patient = patient.assign(
    bmi=patient['admissionweight'] / (patient['admissionheight'] / 100) ** 2
)

# Plausibility limits (all bounds exclusive):
#   weight strictly between 40 kg and 400 kg
#   height strictly between 100 cm (3.2 ft) and 230 cm (~7.5 ft)
#   BMI strictly below 150
weight_ok = (patient['admissionweight'] > 40) & (patient['admissionweight'] < 400)
height_ok = (patient['admissionheight'] > 100) & (patient['admissionheight'] < 230)
bmi_ok = patient['bmi'] < 150
patient = patient[weight_ok & height_ok & bmi_ok]





In [4]:
# Heart-rate rows for the positive and negative stays respectively
hr_is_pos = hr['patientunitstayid'].isin(ptusid_pos['patientunitstayid'])
hr_is_neg = hr['patientunitstayid'].isin(ptusid_neg['patientunitstayid'])
hr_pos = hr[hr_is_pos]
hr_neg = hr[hr_is_neg]

In [5]:
# One-hot encode gender, ethnicity and unitstaytype; the source columns are kept
for categorical in ('gender', 'ethnicity', 'unitstaytype'):
    indicators = pd.get_dummies(patient[categorical], prefix=categorical)
    patient = pd.concat([patient, indicators], axis=1)

# Print 169 +/- nstds standard deviations of admission height.
# NOTE(review): 169 is hard-coded -- presumably the approximate mean height in cm; verify.
nstds = 5
height_band = nstds * patient['admissionheight'].std()
print(str(169 + height_band))
print(str(169 - height_band))

# Per-class views, taken before the columns below are dropped
patient_pos = patient[patient['label'] == 1]
patient_neg = patient[patient['label'] == 0]

# Drop unneeded columns of patient
unneeded_patient_cols = [
    'hospitalid', 'wardid', 'dischargeweight', 'unitdischargetime24',
    'unitdischargeoffset', 'unitdischargelocation', 'hospitaldischargeyear',
    'hospitaldischargetime24', 'hospitaldischargelocation',
]
patient = patient.drop(columns=unneeded_patient_cols)
apacheaddx_all = patient['apacheadmissiondx'].value_counts()



223.2660110885446
114.73398891145541


# AdmissionDX

In [6]:
# Only the stay id and the '|'-separated diagnosis path are needed from here on
admissiondx = admissiondx.loc[:, ['patientunitstayid', 'diagnosisstring']]
admissiondx.head()

Unnamed: 0,patientunitstayid,diagnosisstring
0,141168,cardiovascular|chest pain / ASHD|coronary arte...
1,141168,cardiovascular|ventricular disorders|cardiomyo...
2,141168,pulmonary|disorders of the airways|COPD
3,141168,pulmonary|disorders of the airways|COPD
4,141168,cardiovascular|ventricular disorders|congestiv...


In [7]:
# Collapse to one row per (stay, diagnosis string); the group size itself is discarded
df = (
    admissiondx
    .groupby(['patientunitstayid', 'diagnosisstring'])
    .size()
    .reset_index(name='Freq')
    .drop(columns='Freq')
)
df.head(30)

Unnamed: 0,patientunitstayid,diagnosisstring
0,141168,cardiovascular|arrhythmias|atrial fibrillation...
1,141168,cardiovascular|chest pain / ASHD|coronary arte...
2,141168,cardiovascular|ventricular disorders|cardiomyo...
3,141168,cardiovascular|ventricular disorders|congestiv...
4,141168,pulmonary|disorders of the airways|COPD
5,141168,renal|disorder of kidney|chronic kidney disease
6,141203,cardiovascular|chest pain / ASHD|coronary arte...
7,141203,endocrine|glucose metabolism|diabetes mellitus
8,141203,hematology|coagulation disorders|coagulopathy
9,141203,infectious diseases|GI infections|diarrhea due...


In [8]:
# Split the diagnosis path on '|' and name the first three levels; deeper levels
# keep their integer column labels
level_names = {0: 'Lev1', 1: 'Lev2', 2: 'Lev3'}
foo = df['diagnosisstring'].str.split(pat='|', expand=True).rename(columns=level_names)
foo.head()

Unnamed: 0,Lev1,Lev2,Lev3,3,4,5
0,cardiovascular,arrhythmias,atrial fibrillation,with hemodynamic compromise,,
1,cardiovascular,chest pain / ASHD,coronary artery disease,known,,
2,cardiovascular,ventricular disorders,cardiomyopathy,,,
3,cardiovascular,ventricular disorders,congestive heart failure,,,
4,pulmonary,disorders of the airways,COPD,,,


In [9]:
# Add the top-level diagnosis category next to each (stay, diagnosis) row;
# foo shares df's index, so the values line up row for row
admitdx = df.assign(Lev1=foo['Lev1'])
admitdx.head(10)

Unnamed: 0,patientunitstayid,diagnosisstring,Lev1
0,141168,cardiovascular|arrhythmias|atrial fibrillation...,cardiovascular
1,141168,cardiovascular|chest pain / ASHD|coronary arte...,cardiovascular
2,141168,cardiovascular|ventricular disorders|cardiomyo...,cardiovascular
3,141168,cardiovascular|ventricular disorders|congestiv...,cardiovascular
4,141168,pulmonary|disorders of the airways|COPD,pulmonary
5,141168,renal|disorder of kidney|chronic kidney disease,renal
6,141203,cardiovascular|chest pain / ASHD|coronary arte...,cardiovascular
7,141203,endocrine|glucose metabolism|diabetes mellitus,endocrine
8,141203,hematology|coagulation disorders|coagulopathy,hematology
9,141203,infectious diseases|GI infections|diarrhea due...,infectious diseases


In [10]:
# Distinct top-level diagnosis categories (the first '|'-separated field of diagnosisstring).
# NOTE(review): mystrings is not referenced by any later cell shown here.
mystrings = foo['Lev1'].unique()
# Display the column labels of admitdx.
admitdx.columns

Index(['patientunitstayid', 'diagnosisstring', 'Lev1'], dtype='object')

In [11]:
# Indicator columns for the top-level diagnosis category. Rows still correspond to
# (stay, diagnosis) pairs from admitdx, not to individual stays.
addx = pd.get_dummies(admitdx['Lev1'], prefix='Lev1')
# For each patientunitstayid, the array of distinct top-level categories.
addx2 = admitdx.groupby('patientunitstayid')['Lev1'].unique()
# NOTE(review): neither addx nor addx2 is used by the later cells shown in this notebook.
# The two commented lines below look like an unfinished per-stay aggregation; the second
# one refers to names (drugs3, SEQN) that are not defined anywhere in the visible cells.
#addx3 = 
#drugs3_1 = drugs3.groupby('SEQN').agg({'year':'first', 'taken_rx_med_30d_1.0’:’sum’}).reset_index()

In [12]:
%matplotlib qt
import seaborn as sns

fig = plt.figure()
VTE = sns.distplot(patient_pos['admissionweight'], color='red', bins=20)
noVTE = sns.distplot(patient_neg['admissionweight'], color='blue', bins=20)
plt.xlabel('Admission Weight (kg)')
fig.legend(labels=['VTE', 'No VTE'])

fig = plt.figure()
sns.distplot(patient_pos['age'], color='red', bins=20)
sns.distplot(patient_neg['age'], color='blue', bins=20)
fig.legend(labels=['VTE', 'No VTE'])
plt.xlabel('Age (yrs)')

fig = plt.figure()
sns.distplot(patient_pos['bmi'], color='red', bins=20)
sns.distplot(patient_neg['bmi'], color='blue', bins=20)
plt.xlabel('BMI (kg/m^2)')

fig = plt.figure()
sns.distplot(patient_pos['admissionheight'], color='red', bins=20)
sns.distplot(patient_neg['admissionheight'], color='blue', bins=20)
plt.xlabel('Height (cm))')

fig = plt.figure()
sns.distplot(hr_pos['heartrate'], color='red', bins=20)
sns.distplot(hr_neg['heartrate'], color='blue', bins=20)
plt.xlabel('Average HR During First 24 Hours')

Text(0.5, 0, 'Average HR During First 24 Hours')

# ApachePredVar

In [13]:
# Stay ids still present in "patient" after the plausibility filters
u_ptunitstayid = list(patient.index)
apachepredvar = pd.read_csv('/Users/rachellehorwitz/Documents/ViTalErt/data/over18_eicu/apachePredVar.csv')
apachepredvar = apachepredvar[apachepredvar['patientunitstayid'].isin(u_ptunitstayid)]

# Drop columns based on apachepredvar_allcols.html
unused_apache_cols = [
    'apachepredvarid', 'sicuday', 'saps3day1', 'saps3yesterday', 'saps3today', 'gender',
    'teachtype', 'region', 'amilocation', 'bedcount', 'admitsource', 'graftcount', 'age',
    'meds', 'diedinhospital', 'electivesurgery', 'readmit', 'managementsystem', 'var03hspxlos',
    'ejectfx', 'day1meds', 'day1verbal', 'day1motor', 'day1eyes', 'day1pao2', 'day1fio2',
    'ventday1', 'creatinine', 'dischargelocation',
    'pao2', 'fio2',  # pao2 and fio2 are <0 too frequently
    'admitdiagnosis',
]
apachepredvar = apachepredvar.drop(columns=unused_apache_cols)

# Get rid of missing data: rows where 'verbal' is -1
has_verbal = apachepredvar['verbal'] != -1
apachepredvar = apachepredvar[has_verbal]

# Display only -- the result is not assigned, so apachepredvar keeps
# patientunitstayid as an ordinary column
apachepredvar.set_index('patientunitstayid')

Unnamed: 0_level_0,verbal,motor,eyes,thrombolytics,aids,hepaticfailure,lymphoma,metastaticcancer,leukemia,immunosuppression,cirrhosis,activetx,ima,midur,oobventday1,oobintubday1,diabetes,visitnumber
patientunitstayid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
141168,5,6,4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
141194,4,6,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
141197,5,6,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
141203,1,3,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1
141208,5,6,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353201,5,6,3,0,0,0,0,0,0,0,0,1,0,0,1,1,0,3
3353216,1,5,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1
3353235,5,6,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3353251,1,1,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1


# Define X and y

In [14]:
# Label plus the demographic / admission features taken from "patient"
xy_columns = [
    'label',
    'age', 'admissionweight', 'admissionheight', 'bmi',
    'gender_Female',
    'ethnicity_African American', 'ethnicity_Asian', 'ethnicity_Caucasian',
    'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown',
    'unitstaytype_admit', 'unitstaytype_readmit', 'unitstaytype_transfer',
]
Xy = patient[xy_columns]

In [15]:
# (rows, columns) of the patient-derived table, before the apachepredvar and hr merges.
Xy.shape

(157019, 15)

In [16]:
# (rows, columns) of the filtered apachepredvar table; patientunitstayid is one of the columns.
apachepredvar.shape

(146267, 19)

In [17]:
# Right join: keep every apachepredvar row and attach the matching patient features.
# The key is the index of Xy and an ordinary column of apachepredvar.
Xy2 = Xy.merge(apachepredvar, how='right', on='patientunitstayid')

In [18]:
# Inner join with the heart-rate table on the stay id
Xy3 = Xy2.merge(hr, how='inner', on='patientunitstayid')

In [19]:
# Preview the first rows of the fully merged table.
Xy3.head()

Unnamed: 0,patientunitstayid,label,age,admissionweight,admissionheight,bmi,gender_Female,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown,unitstaytype_admit,unitstaytype_readmit,unitstaytype_transfer,verbal,motor,eyes,thrombolytics,aids,hepaticfailure,lymphoma,metastaticcancer,leukemia,immunosuppression,cirrhosis,activetx,ima,midur,oobventday1,oobintubday1,diabetes,visitnumber,heartrate
0,141168,0,70,84.3,152.4,36.295906,1,0,0,1,0,0,0,1,0,0,5,6,4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,125.05283
1,141194,0,68,73.9,180.3,22.732803,0,0,0,1,0,0,0,1,0,0,4,6,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,86.860627
2,141197,0,71,102.1,162.6,38.617545,0,0,0,1,0,0,0,1,0,0,5,6,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,97.307692
3,141203,0,77,70.2,160.0,27.421875,1,0,0,1,0,0,0,1,0,0,1,3,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,91.543554
4,141208,0,25,95.3,172.7,31.952749,1,0,0,1,0,0,0,1,0,0,5,6,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,77.81746


In [20]:
# Total number of rows in the merged table
len(Xy3)

146267

In [21]:
# Number of rows labelled 0 (negative class)
len(Xy3[Xy3['label'] == 0])

145815

In [22]:
# Number of rows labelled 1 (positive class)
len(Xy3[Xy3['label'] == 1])

452

In [23]:
# Number of columns in the merged table, shown as a 1-tuple.
Xy3.columns.shape

(35,)

In [24]:
# List the column labels of the merged table.
Xy3.columns

Index(['patientunitstayid', 'label', 'age', 'admissionweight',
       'admissionheight', 'bmi', 'gender_Female', 'ethnicity_African American',
       'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic',
       'ethnicity_Native American', 'ethnicity_Other/Unknown',
       'unitstaytype_admit', 'unitstaytype_readmit', 'unitstaytype_transfer',
       'verbal', 'motor', 'eyes', 'thrombolytics', 'aids', 'hepaticfailure',
       'lymphoma', 'metastaticcancer', 'leukemia', 'immunosuppression',
       'cirrhosis', 'activetx', 'ima', 'midur', 'oobventday1', 'oobintubday1',
       'diabetes', 'visitnumber', 'heartrate'],
      dtype='object')

In [25]:
# Build a pandas-profiling report object for the merged table Xy3.
profile = ProfileReport(Xy3, title='Pandas Profiling Report')

In [26]:
# Write the report as HTML. The path is relative, so the file lands in the
# kernel's current working directory; the date in the name is hard-coded.
profile.to_file("X_2020_06_11.html")

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=49.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [27]:
# Save the merged table as CSV without the row index. The path is relative to the
# current working directory and the timestamp in the name is hard-coded.
Xy3.to_csv('Xy_2020_06_11_1916.csv', index=False)