# Settings, Directory Specs, and Imports

In [1]:
do_plots = 1 # 1: create plots

dir_read = '/Users/rachellehorwitz/Documents/ViTalErt/data/filtered05/' # directory to read data from

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, confusion_matrix, plot_confusion_matrix, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split, ShuffleSplit, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import check_random_state
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import streamlit as st
import pickle
from datetime import datetime
import pyprojroot
from pandas_profiling import ProfileReport
from datetime import datetime
from sklearn.model_selection._split import _BaseKFold
from sklearn.model_selection._split import _RepeatedSplits
from collections import defaultdict
from collections import Counter
import sys
sys.path.insert(1, '../src')
from utils import *

pd.set_option('display.max_columns', None)

matplotlib.rcParams.update({'font.size': 33})

%matplotlib qt

# Data Wrangling

In [2]:
# Read files from csv
patient = pd.read_csv(dir_read + 'patient.csv')
diagnosis = pd.read_csv(dir_read + 'diagnosis.csv')
ptusid_pos = pd.read_csv(dir_read + 'pos_ptunitstayid.csv')
hr = pd.read_csv(dir_read + 'vitalPeriodic_heartrate_first24.csv')
hr = hr.set_index('patientunitstayid')

print('There are ' + str(patient.shape[0]) + ' patientunitstayids in "patient.csv" but only ' \
      + str(hr.shape[0]) + ' patientunitstayids in the hr csv file')
print('Cleaning that up for patient, diagnosis, and ptusid_pos.')
patient = patient[patient['patientunitstayid'].isin(list(hr.index))]
diagnosis = diagnosis[diagnosis['patientunitstayid'].isin(list(hr.index))]
ptusid_pos = ptusid_pos[ptusid_pos['patientunitstayid'].isin(list(hr.index))]

hr = hr.reset_index()

There are 161543 patientunitstayids in "patient.csv" but only 158784 patientunitstayids in the hr csv file
Cleaning that up for patient, diagnosis, and ptusid_pos.


In [3]:
# Create dataframe of patientunitstayids in negative class
tmp = patient[~patient['patientunitstayid'].isin(ptusid_pos['patientunitstayid'])]
ptusid_neg = tmp[['patientunitstayid']]

# Create labels
ptusid_neg = ptusid_neg.assign(label=0)
ptusid_pos = ptusid_pos.assign(label=1)

# Add labels to "patient" dataframe and move 'label' column to left
ptusid_w_label = pd.concat([ptusid_pos, ptusid_neg])
ptusid_w_label.shape[0]
ptusid_w_label = ptusid_w_label.set_index('patientunitstayid')
patient = patient.set_index('patientunitstayid')
patient = patient.join(ptusid_w_label)

# Move 'label' column to left
cols = list(patient.columns)
cols = [cols[-1]] + cols[:-1]
patient = patient[cols]

# Add BMI column to patient
patient = patient.assign(bmi=lambda x: x.admissionweight/((x.admissionheight/100)**2))

# Remove patients who weigh less than 40 kg or more than 300 kg
patient = patient[(patient['admissionweight'] > 40) & \
                   (patient['admissionweight'] < 300)]

# Remove patients who are shorter than 100 cm (3.2 ft) or taller than 230 cm (~7.5 ft)
patient = patient[(patient['admissionheight'] > 100 ) & (patient['admissionheight'] < 230)]

# Remove patients with BMI > 100
patient = patient[patient['bmi'] < 100]

# Dummy variables for gender, ethnicity, unitstaytype
patient = pd.concat([patient, pd.get_dummies(patient['gender'], prefix='gender')], axis=1)
patient = pd.concat([patient, pd.get_dummies(patient['ethnicity'], prefix='ethnicity')], axis=1)
patient = pd.concat([patient, pd.get_dummies(patient['unitstaytype'], prefix='unitstaytype')], axis=1)

nstds = 5
print(str(169+nstds*patient['admissionheight'].std()))
print(str(169-nstds*patient['admissionheight'].std()))

patient_pos = patient[patient['label']==1]
patient_neg = patient[patient['label']==0]
patient[patient['label']==1].shape[0]

# Drop unneeded columns of patient
patient = patient.drop(columns=['hospitalid', 'wardid', 'dischargeweight', 'unitdischargetime24', \
                                'unitdischargeoffset', 'unitdischargelocation', 'hospitaldischargeyear',\
                               'hospitaldischargetime24', 'hospitaldischargelocation'])

223.22514749375495
114.77485250624505


In [4]:
patient.shape

(156992, 33)

In [5]:
# ApachePredVar
u_ptunitstayid = list(patient.index)
apachepredvar = pd.read_csv('/Users/rachellehorwitz/Documents/ViTalErt/data/over18_eicu/apachePredVar.csv')
apachepredvar = apachepredvar[apachepredvar['patientunitstayid'].isin(u_ptunitstayid)]

# Drop columns based on apachepredvar_allcols.html
apachepredvar = apachepredvar.drop(columns=['apachepredvarid', 'sicuday', 'saps3day1', 'saps3yesterday', 'saps3today', 'gender', \
                                    'teachtype', 'region', 'amilocation', 'bedcount', 'admitsource', 'graftcount', 'age', \
                                   'meds', 'diedinhospital', 'electivesurgery', 'readmit', 'managementsystem', 'var03hspxlos', \
                                   'ejectfx', 'day1meds', 'day1verbal', 'day1motor', 'day1eyes',  'day1pao2', 'day1fio2', \
                                           'ventday1', 'creatinine', 'dischargelocation', 'pao2', 'fio2']) # pao2 and fio2 are <0 too frequently
apachepredvar = apachepredvar.drop(columns=['admitdiagnosis'])
apachepredvar = apachepredvar[apachepredvar['verbal']!=-1] # Get rid of missing data
apachepredvar.set_index('patientunitstayid')

Unnamed: 0_level_0,verbal,motor,eyes,thrombolytics,aids,hepaticfailure,lymphoma,metastaticcancer,leukemia,immunosuppression,cirrhosis,activetx,ima,midur,oobventday1,oobintubday1,diabetes,visitnumber
patientunitstayid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
141168,5,6,4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
141194,4,6,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
141197,5,6,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
141203,1,3,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1
141208,5,6,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353201,5,6,3,0,0,0,0,0,0,0,0,1,0,0,1,1,0,3
3353216,1,5,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1
3353235,5,6,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3353251,1,1,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1


# Define X and y

In [6]:
Xy = patient[['uniquepid', 'label',  'age', 'admissionweight', 'admissionheight', 'bmi', 'gender_Female',
       'ethnicity_African American', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic',
       'ethnicity_Native American', 'ethnicity_Other/Unknown']]
Xy = Xy.merge(apachepredvar, how='right', left_on='patientunitstayid', right_on='patientunitstayid')
Xy = Xy.merge(hr, how='inner', left_on='patientunitstayid', right_on='patientunitstayid')

# Arrange by categorical & continuous variables
vars_categ = ['gender_Female', 'ethnicity_African American', 'ethnicity_Asian', 'ethnicity_Caucasian', \
             'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown',\
             'thrombolytics', 'aids', 'hepaticfailure', 'lymphoma', 'metastaticcancer', 'leukemia', \
             'immunosuppression', 'cirrhosis', 'activetx', 'ima', 'midur',
             'oobventday1', 'oobintubday1', 'diabetes']
vars_cont = ['age', 'admissionweight', 'admissionheight', 'bmi', \
             'verbal', 'motor', 'eyes', 'visitnumber', 'heartrate']
Xy = pd.concat([Xy[['patientunitstayid', 'uniquepid', 'label']], Xy[vars_cont], Xy[vars_categ]], axis=1)

In [7]:
Xy.head()

Unnamed: 0,patientunitstayid,uniquepid,label,age,admissionweight,admissionheight,bmi,verbal,motor,eyes,visitnumber,heartrate,gender_Female,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown,thrombolytics,aids,hepaticfailure,lymphoma,metastaticcancer,leukemia,immunosuppression,cirrhosis,activetx,ima,midur,oobventday1,oobintubday1,diabetes
0,141168,002-34851,0,70,84.3,152.4,36.295906,5,6,4,1,125.05283,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,141194,002-5276,0,68,73.9,180.3,22.732803,4,6,3,1,86.860627,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,141197,002-37665,0,71,102.1,162.6,38.617545,5,6,4,1,97.307692,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,141203,002-23234,0,77,70.2,160.0,27.421875,1,3,1,1,91.543554,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
4,141208,002-34521,0,25,95.3,172.7,31.952749,5,6,3,1,77.81746,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Describe postive and negative classes
print('Columns:')
print(Xy.columns)
print('--------------------------')
print('Total: ' + str(Xy.shape[0]) + ' unique patientunitstayids from ' + \
     str(Xy.groupby('uniquepid')['patientunitstayid'].count().shape[0]) + ' unique patients')
print('Negative class: ' + str(Xy[Xy['label']==0].shape[0]) + ' unique patientunitstayids from ' + \
     str(Xy[Xy['label']==0].groupby('uniquepid')['patientunitstayid'].count().shape[0]) + ' unique patients')
print('Positive class: ' + str(Xy[Xy['label']==1].shape[0]) + ' unique patientunitstayids from ' + \
     str(Xy[Xy['label']==1].groupby('uniquepid')['patientunitstayid'].count().shape[0]) + ' unique patients')


# How many patients are in both classes?
Xy1 = Xy[Xy['label']==1]
Xy0 = Xy[Xy['label']==0]
upid_1 = Xy1['uniquepid'].unique()
upid_1 = list(upid_1)
patients_in_both = Xy0[Xy0['uniquepid'].isin(upid_1)]
print('There are ' + str(len(patients_in_both)) + ' patients in both classes')
print('--------------------------')
# Describe the features
print('There are ' + str(len(vars_categ)) + ' categorical features')
print('There are ' + str(len(vars_cont)) + ' continuous features')

# Separate X and y
Xy = Xy.set_index('patientunitstayid')
y = Xy.pop('label')
X = Xy.copy()
groups = Xy['uniquepid'].astype('category').cat.codes # each uniquepid is now a unique number
X = X.drop(columns='uniquepid', axis=1) # remove uniquepid as a feature because it's a group
X.head()

Columns:
Index(['patientunitstayid', 'uniquepid', 'label', 'age', 'admissionweight',
       'admissionheight', 'bmi', 'verbal', 'motor', 'eyes', 'visitnumber',
       'heartrate', 'gender_Female', 'ethnicity_African American',
       'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic',
       'ethnicity_Native American', 'ethnicity_Other/Unknown', 'thrombolytics',
       'aids', 'hepaticfailure', 'lymphoma', 'metastaticcancer', 'leukemia',
       'immunosuppression', 'cirrhosis', 'activetx', 'ima', 'midur',
       'oobventday1', 'oobintubday1', 'diabetes'],
      dtype='object')
--------------------------
Total: 146242 unique patientunitstayids from 118016 unique patients
Negative class: 145790 unique patientunitstayids from 117723 unique patients
Positive class: 452 unique patientunitstayids from 447 unique patients
There are 268 patients in both classes
--------------------------
There are 21 categorical features
There are 9 continuous features


Unnamed: 0_level_0,age,admissionweight,admissionheight,bmi,verbal,motor,eyes,visitnumber,heartrate,gender_Female,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown,thrombolytics,aids,hepaticfailure,lymphoma,metastaticcancer,leukemia,immunosuppression,cirrhosis,activetx,ima,midur,oobventday1,oobintubday1,diabetes
patientunitstayid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
141168,70,84.3,152.4,36.295906,5,6,4,1,125.05283,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
141194,68,73.9,180.3,22.732803,4,6,3,1,86.860627,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
141197,71,102.1,162.6,38.617545,5,6,4,1,97.307692,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
141203,77,70.2,160.0,27.421875,1,3,1,1,91.543554,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
141208,25,95.3,172.7,31.952749,5,6,3,1,77.81746,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Split into 80/20 train&val/test using StratifiedGroupKFold()
cv = StratifiedGroupKFold(n_splits = 5, shuffle=True, random_state=1)
trainval_idx, test_idx = next(cv.split(X, y, groups))
X_trainval = X.iloc[trainval_idx]
y_trainval = y.iloc[trainval_idx]
X_test = X.iloc[test_idx]
y_test = y.iloc[test_idx]

In [10]:
Xy_trainval = pd.concat([y_trainval, X_trainval], axis=1)
Xy_trainval_pos = Xy_trainval[Xy_trainval['label']==1]
Xy_trainval_neg = Xy_trainval[Xy_trainval['label']==0]
X_trainval_pos = Xy_trainval_pos.drop(columns='label')
X_trainval_neg = Xy_trainval_neg.drop(columns='label')

In [11]:
#profile = ProfileReport(Xy_trainval, title='Pandas Profiling Report')
#profile.to_file("Xy_trainval_profilereport.html")

In [None]:
corr_map = Xy_trainval.corr()
corr_map

In [None]:
corrmap_bigcorrs = corr_map[corr_map >0.9]
corr_map.where(corr_map>0.8)

# Plots

In [None]:
import seaborn as sns

X_trainval_pos = X_trainval[y_trainval==1]
X_trainval_neg = X_trainval[y_trainval==0]

fig = plt.figure()
VTE = sns.distplot(X_trainval_pos['admissionweight'], color='red', bins=20)
noVTE = sns.distplot(X_trainval_neg['admissionweight'], color='blue', bins=20)
plt.xlabel('Admission Weight (kg)')
fig.legend(labels=['VTE', 'No VTE'])

fig = plt.figure()
sns.distplot(X_trainval_pos['age'], color='red', bins=20)
sns.distplot(X_trainval_neg['age'], color='blue', bins=20)
fig.legend(labels=['VTE', 'No VTE'])
plt.xlabel('Age (yrs)')

fig = plt.figure()
sns.distplot(X_trainval_pos['bmi'], color='red', bins=20)
sns.distplot(X_trainval_neg['bmi'], color='blue', bins=20)
plt.xlabel('BMI (kg/m^2)')

fig = plt.figure()
sns.distplot(X_trainval_pos['admissionheight'], color='red', bins=20)
sns.distplot(X_trainval_neg['admissionheight'], color='blue', bins=20)
plt.xlabel('Height (cm))')

plt.rcParams.update({'font.size': 40})
sns.set(font_scale = 1.25)
fig = plt.figure()
sns.distplot(X_trainval_pos['heartrate'], color='red', bins=20, hist=False, label='VTE')
sns.distplot(X_trainval_neg['heartrate'], color='blue', bins=20, hist=False, label='No VTE')
plt.xlabel('Average Heart Rate \nDuring First 24 Hours (beats/min)')
plt.ylabel('Density')
plt.tight_layout()

In [None]:
# Table of each variable's average value 
Xy_trainval_categ = pd.concat([Xy_trainval['label'], Xy_trainval[vars_categ]], axis=1)
table = pd.pivot_table(data=Xy_trainval_categ, index=['label'])
tablet = table.transpose()
tablet = tablet.rename(columns = {0: 'No VTE', 1: 'VTE'})
tablet = tablet.rename(index={'activetx': 'Active \n Treatment', 'metastaticcancer': 'Metastatic \n Cancer', 'oobventday1': 'Ventilated'})
tablet['VTE'] = tablet['VTE']
tablet['No VTE'] = tablet['No VTE']
tablet

In [None]:
# Scatter plot of some of the features
# Randomly sample 200 data points from each class
nsamp = 200
Xy_pos_nsamp = Xy_trainval[Xy_trainval['label']==1].sample(n = nsamp)
Xy_neg_nsamp = Xy_trainval[Xy_trainval['label']==0].sample(n = nsamp)
Xy_nsamp = pd.concat([Xy_pos_nsamp, Xy_neg_nsamp])
vars_compare = ['age', 'admissionweight', 'admissionheight', 'bmi', 'heartrate']


In [None]:
cols = list(Xy_nsamp.columns)
cols_new = ['label', 'age', 'admissionweight', 'admissionheight', 'bmi', 'heartrate']
Xy_nsamp_cont = Xy_nsamp[cols_new]


In [None]:
ax = sns.pairplot(Xy_nsamp_cont, hue='label')