# Settings and Imports

In [1]:
write_over_18 = 0 # set to 1 if you want to write files for patients over 18
write_filtered02 = 0 # set to 1 if you want to write files for patients who meet the following criteria:
    # have age and admissionweight
    # over 18
    # Alive after 48 hours of ICU admission
    # Known discharge status
    # NOT admitted with VTE
write_filtered03 = 1 # set to 1 if you want to write files for the patients who meet the following criteria:
    # have age and admissionweight
    # over 18
    # Alive after 24 hours of ICU admission
    # Known discharge status
    # NOT admitted with VTE
do_eda_admitdx = 0
time_thresh = 24*60

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import glob, os
import textwrap

# Remove patients 18 & under

In [3]:
# EICU
import pandas as pd
pd.set_option('display.max_colwidth', None)
dir_data_raw = "/Users/rachellehorwitz/Documents/ViTalErt/data/raw_eicu/"
dir_data_over18 = "/Users/rachellehorwitz/Documents/ViTalErt/data/over18_eicu/"
dir_data_write = "/Users/rachellehorwitz/Documents/ViTalErt/data/filtered01/"
# admissiondx = pd.read_csv(dir_data_raw + 'admissionDx.csv')
# apachePredVar = pd.read_csv(dir_data_raw + 'apachePredVar.csv')
# diagnosis = pd.read_csv(dir_data_raw + 'diagnosis.csv')
# customLab = pd.read_csv(dir_data_raw + 'customLab.csv')

## Remove patients with age=NaN or admissionweight=NaN

In [4]:
# Get patientUnitStayIds of patients who are >18 years old
# everything except hospital.csv can be indexed by patientUnitStayID
# patient.csv has age
import glob, os
import pandas as pd
import numpy as np

# Read raw patient.csv file
patient_raw = pd.read_csv(dir_data_raw + 'patient.csv')
print(str(patient_raw['patientunitstayid'].unique().shape[0]) + ' u patientunitidstays read from raw csv')

# Remove patients with age and/or admissionweight NaN
u_ptunitstayid_ageweight_na = patient_raw[(patient_raw['age'].isnull()) | \
                       (patient_raw['admissionweight'].isnull())]['patientunitstayid'].unique()
patient = patient_raw[~patient_raw['patientunitstayid'].isin(u_ptunitstayid_ageweight_na)]
print(str(patient.shape[0]) + ' patientunitstayids with age & admissionweight not na')

200859 u patientunitidstays read from raw csv
184117 patientunitstayids with age & admissionweight not na


## Change age '> 89'  to '91' and convert to int

In [5]:
# Patients with age > 89 do not have their actual age. They are just listed as '> 89'
# In MIMIC, it was known that the median age of patients > 89 was 91. Use that value here
print(str(patient[patient['age']=='> 89'].shape[0]) + ' patientunitstayids with age > 89. Changed to 91')
patient = patient.assign(age=patient.age.replace('> 89', '91').astype(int))
print(str(patient[~(patient['age'] > 18)].shape[0]) + ' patients <=18 yo -- to remove')

# Keep track of patients removing
ptunitstayidG18_remove = patient.loc[~(patient['age'] > 18)]['patientunitstayid']
ptunitstayidG18_remove = np.union1d(np.array(ptunitstayidG18_remove.unique()), \
                                    np.array(u_ptunitstayid_ageweight_na))

# Remove patients
patient = patient[~patient['patientunitstayid'].isin(ptunitstayidG18_remove)]
print(str(patient['patientunitstayid'].unique().size) + ' u patientunitidstays remaining. \n' \
     '\t age >18, have admissionweight')

# Write patients

# patientunitidstays to keep
patientUnitStayIdKeep = patient['patientunitstayid'].unique()
print('patientUnitStayIdKeep has ' + str(patientUnitStayIdKeep.size) + ' elements')

6458 patientunitstayids with age > 89. Changed to 91
969 patients <=18 yo -- to remove
183148 u patientunitidstays remaining. 
	 age >18, have admissionweight
patientUnitStayIdKeep has 183148 elements


## For each csv file except hospital, keep only data for patients >18 who have admissionweight

In [6]:
dir_write_to = dir_data_over18 #'/Users/rachellehorwitz/Documents/tmp/' #

# First, write patients.csv
patient.to_csv(dir_write_to + 'patient.csv', index=False)

# Write other files
import sys
sys.path.insert(0, '/Users/rachellehorwitz/Documents/ViTalErt/src')

# Get list of unique patientunitstayids to keep
patientUnitStayIdKeep = patient['patientunitstayid'].unique()

# # Get list of csvfiles to modify
# csvfiles = []
# for file in glob.glob(dir_data_raw + '*.csv'):
#     csvfiles.append(file)
#csvfiles.remove(dir_data_raw + 'patient.csv')

csvfiles = [dir_data_raw + 'diagnosis.csv', dir_data_raw + 'admissiondx.csv', dir_data_raw + 'lab.csv']

# If specified to write files, keep ONLY patientunitstayids for patients >18 and have admissionweight
if write_over_18 == 1:
    from rewrite_patientsOver18 import *
    
    #keepPtsOver18(csvfiles, patientUnitStayIdKeep, dir_write_to, dir_data_raw)
    for csvfile in csvfiles:
        print('------')
        print('Working on ' + csvfile)
        data_raw = pd.read_csv(csvfile)
        data_keep = data_raw[data_raw['patientunitstayid'].isin(patientUnitStayIdKeep)]
        base = os.path.basename(csvfile)
        print(base)
        newFileNameWithPath = dir_data_over18 + base
        data_keep.to_csv(newFileNameWithPath, index=False)
        print('Saved data to ' + newFileNameWithPath)
        print('------')
print('----------------------------------------')
print('FINISHED!')

----------------------------------------
FINISHED!


# Eliminate Patients Who Died Too Soon Or Unkown Discharge Status

In [7]:
# Eliminate patientunitstayids with NaN unitdischargestatus
dir_read = dir_data_over18
patient = pd.read_csv(dir_data_over18 + 'patient.csv')
patient_nandischarge = patient[patient['unitdischargestatus'].isnull()]
print(patient['unitdischargestatus'].unique())
print(str(patient_nandischarge.shape[0]) + ' patientunitstayids with nan unitdischargestatus')
ptunitstayid_nandischarge = patient_nandischarge['patientunitstayid'].unique()
ptunitstayidG18_remove = np.union1d(ptunitstayidG18_remove, ptunitstayid_nandischarge)
print('ptunitstayidG18_remove currently has ' + \
      str(np.unique(ptunitstayidG18_remove).shape[0]) + ' u patientunitstayids')

['Expired' 'Alive' nan]
28 patientunitstayids with nan unitdischargestatus
ptunitstayidG18_remove currently has 17739 u patientunitstayids


In [8]:
# Eliminate patientunitstayids for patients who died within 48 hours of admittance to ICU
deathThreshHrs = 24
patient_diedTooSoon = patient[(patient['unitdischargestatus']=='Expired') \
                     & (patient['unitdischargeoffset'] < deathThreshHrs*60)]
print(str(patient_diedTooSoon.shape[0]) + ' patientunitstayids died within ' \
      +  str(deathThreshHrs) + ' hours of admission to ICU')
patient_diedTooSoon = patient_diedTooSoon['patientunitstayid'].unique()
print('ptunitstayidG18_remove currently has ' + str(np.unique(ptunitstayidG18_remove).shape[0]) + ' u patientunitstayids')
print(str(patient_diedTooSoon.shape[0]) \
      + ' patientunitstayids will be removed because of NaN dischargestatus' \
                + ' or died within ' + str(deathThreshHrs) + ' hours of ICU admission')

# Add patients who died to soon or have NaN discharge status to list of patients to remove
ptunitstayidG18_remove = np.union1d(ptunitstayidG18_remove, patient_diedTooSoon)
# patient = patient[~patient['patientunitstayid'].isin(ptunitstayidG18_remove)]
# print(str(patient.shape[0]) + ' patientunitstayids remaining')
patient = patient[~patient['patientunitstayid'].isin(ptunitstayidG18_remove)]
print(str(ptunitstayidG18_remove.shape[0]) + ' total patients SO FAR have been removed') 
print(str(patient['patientunitstayid'].unique().shape[0]) + ' patientunitstayids remaining')

3278 patientunitstayids died within 24 hours of admission to ICU
ptunitstayidG18_remove currently has 17739 u patientunitstayids
3278 patientunitstayids will be removed because of NaN dischargestatus or died within 24 hours of ICU admission
21017 total patients SO FAR have been removed
179842 patientunitstayids remaining


# Eliminate patients with NaN admissiondx OR have DVT/PE at admission

In [9]:
admissiondx = pd.read_csv(dir_data_over18 + 'admissiondx.csv')
print(str(admissiondx['patientunitstayid'].unique().shape[0]) + \
      ' u patientunitstayids in admissiondx_over18')

171055 u patientunitstayids in admissiondx_over18


In [10]:
admissiondx.head(30)

Unnamed: 0,admissiondxid,patientunitstayid,admitdxenteredoffset,admitdxpath,admitdxname,admitdxtext
0,8023778,2900217,62,admission diagnosis|Operative Organ Systems|Organ System|Cardiovascular,Cardiovascular,Cardiovascular
1,8023777,2900217,62,admission diagnosis|Was the patient admitted from the O.R. or went to the O.R. within 4 hours of admission?|Yes,Yes,Yes
2,8023779,2900217,62,admission diagnosis|All Diagnosis|Operative|Diagnosis|Cardiovascular|Aortic and Mitral valve replacement,Aortic and Mitral valve replacement,Aortic and Mitral valve replacement
3,7929318,2900240,53,admission diagnosis|Non-operative Organ Systems|Organ System|Gastrointestinal,Gastrointestinal,Gastrointestinal
4,7929317,2900240,53,admission diagnosis|Was the patient admitted from the O.R. or went to the O.R. within 4 hours of admission?|No,No,No
5,7929319,2900240,53,"admission diagnosis|All Diagnosis|Non-operative|Diagnosis|Gastrointestinal|Bleeding, GI-location unknown","Bleeding, GI-location unknown","Bleeding, GI-location unknown"
6,7912438,2900262,6,"admission diagnosis|All Diagnosis|Non-operative|Diagnosis|Cardiovascular|Infarction, acute myocardial (MI)","Infarction, acute myocardial (MI)","Infarction, acute myocardial (MI)"
7,7912437,2900262,6,admission diagnosis|Non-operative Organ Systems|Organ System|Cardiovascular,Cardiovascular,Cardiovascular
8,7912439,2900262,6,admission diagnosis|Additional APACHE Information|Acute MI location|anterior,anterior,anterior
9,7912440,2900262,6,admission diagnosis|Additional APACHE Information|Thrombolytic Therapy received within 24 hours|No,No,No


## Eliminate patients who do not have an admissiondx

In [11]:
# Eliminate patients who died too soon or have unknown discharge status
admissiondx = admissiondx[~admissiondx['patientunitstayid'].isin(ptunitstayidG18_remove)]
print(str(admissiondx.shape[0]) + ' admissiondx entries for patients >18,' \
     ' and with ok discharge')
print(str(admissiondx['patientunitstayid'].unique().shape[0]) + \
      ' unique patientunitstayids in admissionsdx')

# Get patientunitstayids to keep (the ones in both admissiondx_okdischarge AND patient)
ptunitstayid_keep = list(set(patient['patientunitstayid']) &\
                                 set(admissiondx['patientunitstayid'].unique()))
ptunitstayid_keep = np.array(ptunitstayid_keep)
print(str(int(np.unique(ptunitstayid_keep.shape[0]))) + ' unique patientunitstayids who meet previous critera and' \
     ' have admissionsdx')

# Add to ptunitstayidG18_remove
ptunitstayid_nodx = list(set(patient['patientunitstayid'].unique()) - \
                         set(admissiondx['patientunitstayid'].unique()))
ptunitstayid_nodx = np.array(ptunitstayid_nodx)
print(str(np.unique(ptunitstayid_nodx).shape[0]) + ' u patientunitstayids that don''t have admissiondx')
ptunitstayidG18_remove = np.union1d(ptunitstayidG18_remove, ptunitstayid_nodx)
print(str(np.unique(ptunitstayidG18_remove).shape[0]) + ' u patientstayids who' \
     ' have unknown discharge status, died too early, and/or do not have admissiondx')

# Remove patientunitstayids
patient = patient[~patient['patientunitstayid'].isin(ptunitstayidG18_remove)]
admissiondx = admissiondx[~admissiondx['patientunitstayid'].isin(ptunitstayidG18_remove)]
print(str(admissiondx['patientunitstayid'].unique().shape[0]) + ' u patientunitidstays remaining in admissiondx')
print(str(patient['patientunitstayid'].unique().shape[0]) + ' u patientunitidstays remaining in patient')

592405 admissiondx entries for patients >18, and with ok discharge
167985 unique patientunitstayids in admissionsdx
167985 unique patientunitstayids who meet previous critera and have admissionsdx
11857 u patientunitstayids that dont have admissiondx
32874 u patientstayids who have unknown discharge status, died too early, and/or do not have admissiondx
167985 u patientunitidstays remaining in admissiondx
167985 u patientunitidstays remaining in patient


## Eliminate patients diagnosed with DVT/PE at admission

### Eliminate patients diagnosed with DVT at admission

In [12]:
# Eliminate patientunitstayids who were diagnosed with DVT upon admission to ICU
dvt_admissiondx = admissiondx[admissiondx['admitdxname'].str.contains('Thrombosis, vascular', case=False)]
ptunitstayid_dvt_admission = dvt_admissiondx['patientunitstayid'].unique()
print(str(ptunitstayid_dvt_admission.shape[0]) + ' patientunitstayids were diagnosed with "Thrombosis, vascular" upon admission')
ptunitstayidG18_remove = np.union1d(ptunitstayidG18_remove, ptunitstayid_dvt_admission)
patient = patient[~patient['patientunitstayid'].isin(ptunitstayidG18_remove)]
admissiondx = admissiondx[~admissiondx['patientunitstayid'].isin(ptunitstayidG18_remove)]

# Take union(patientunitstayids diagnosed with DVT upon admission to ICU ||
#.           patientunitstayids diagnosed with DVT shortly after ICU admission (within time_thresh))
#ptunitstayid_dvt_too_early = np.union1d(u_vte_lessThanTime_patientunitstayid, ptunitstayid_dvt_admission)
#ptunitstayid_dvt_too_early.shape[0]
#print(str(ptunitstayid_dvt_too_early.shape[0]) + ' patientunitstayids had "Thrombosis, vascular" upon admission \n and/or diagnosed with VTE within timeThresh')

474 patientunitstayids were diagnosed with "Thrombosis, vascular" upon admission


### EDA on types of admission diagnoses 

In [13]:
if do_eda_admitdx == 1:
    admissiondx_eval = admissiondx.copy()

In [14]:
if do_eda_admitdx == 1:
    n = 30
    admissiondx_eval['admitdxpath'].value_counts()[:n].to_dict()

In [15]:
if do_eda_admitdx == 1:
    admissiondx_eval[(admissiondx_eval['admitdxpath'].str.contains('pulmonary')) \
                     & ~(admissiondx_eval['admitdxpath'].str.contains('Sepsis')) \
                    & ~(admissiondx_eval['admitdxpath'].str.contains('ARDS')) \
                    & ~(admissiondx_eval['admitdxpath'].str.contains('Hemorrhage')) \
                    & ~(admissiondx_eval['admitdxpath'].str.contains('Restrictive lung disease'))]

In [16]:
if do_eda_admitdx == 1:
    admissiondx_eval[admissiondx_eval['admitdxpath'].str.contains('Embolus, pulmonary', case=False)]

### Eliminate patients diagnosed with PE at admission

In [17]:
# Get patientunitidstays with PE at admission
admissiondx_no_pe = admissiondx[~admissiondx['admitdxpath'].str.contains('Embolus, pulmonary', case=False)]
admissiondx_pe = admissiondx[admissiondx['admitdxpath'].str.contains('Embolus, pulmonary', case=False)]
ptunitstayids_no_pe = admissiondx_no_pe['patientunitstayid'].unique()
ptunitstayids_pe = admissiondx_pe['patientunitstayid'].unique()
print(str(ptunitstayids_pe.shape[0]) + ' u patientunitstayids with PE at admission that will be removed')

# Add new patientunitidstays with PE to the list of patientunitidstays to remove
ptunitstayidG18_remove = np.union1d(ptunitstayidG18_remove, ptunitstayids_pe)

# Remove patientunitidstays from admissiondx and patient
admissiondx = admissiondx[~admissiondx['patientunitstayid'].isin(ptunitstayidG18_remove)]
patient = patient[~patient['patientunitstayid'].isin(ptunitstayidG18_remove)]
print(str(patient['patientunitstayid'].unique().shape[0]) + ' u patientunitstayids remaining in "patient"')
print(str(admissiondx['patientunitstayid'].unique().shape[0]) + ' u patientunitstayids remaining in "admissiondx"')

1612 u patientunitstayids with PE at admission that will be removed
165899 u patientunitstayids remaining in "patient"
165899 u patientunitstayids remaining in "admissiondx"


# Get Patients Diagnosed with VTE in ICU

In [18]:
diagnosis = pd.read_csv(dir_data_over18 + 'diagnosis.csv')

# Remove patientunitstayids from diagnosis that we already know we're removing
diagnosis = diagnosis[~diagnosis['patientunitstayid'].isin(ptunitstayidG18_remove)]
print('After removing patientunitstayids that were removed from "patient.csv" and "admitdx.csv",' \
     + str(diagnosis['patientunitstayid'].unique().shape[0]) + ' u patientunitstayids \n and ' \
      + str(diagnosis.shape[0]) + ' rows are left in diagnosis.csv')
diagnosis.head()

# # Add ICD9 codes for diseases that clearly have them
# # diabetes melitus --> 250.xx
# diagnosis.loc[(diagnosis.diagnosisstring.str.contains('diabetes')) & \
#               (pd.isnull(diagnosis.icd9code)), 'icd9code']= '250.xx'
# # cardiomyopathy --> 425.xx
# diagnosis.loc[(diagnosis.diagnosisstring.str.contains('cardiomyopathy')) & \
#               (pd.isnull(diagnosis.icd9code)), 'icd9code']= '425.xx'

# Import ICD9 codes for VTE
from icd9codes import *
icd9_vte = getVTEicd9codes()
print(icd9_vte)


After removing patientunitstayids that were removed from "patient.csv" and "admitdx.csv",159567 u patientunitstayids 
 and 2576098 rows are left in diagnosis.csv
['415.1', '415.10', '415.11', '415.13', '415.19', '451.0', '451.1', '451.10', '451.11', '451.19', '451.2', '451.20', '451.81', '451.89', '451.9', '451.90', '453.1', '453.10', '453.2', '453.20', '453.4', '453.40', '453.41', '453.42', '453.6', '453.60', '453.8', '453.80', '453.89', '453.9', '453.90']


In [19]:
# Find patientunitstayIDs in pos and neg classes
# NON-NULL ICD9 CODES: find those diagnoses with VTE within 48 hours of ICU admission and remove them 
diagnosis_hasicd9 = diagnosis[~diagnosis['icd9code'].isnull()]
diagnosis_hasicd9_vte = pd.DataFrame()

for code in icd9_vte:
    print(str(code))
    rows = diagnosis_hasicd9[diagnosis_hasicd9['icd9code'].str.contains(code)]
    diagnosis_hasicd9_vte = pd.concat([rows, diagnosis_hasicd9_vte])
    diagnosis_hasicd9_vte = diagnosis_hasicd9_vte.loc[~diagnosis_hasicd9_vte.index.duplicated(keep='first')]

idx_diagnosis_hasicd9_vte =  diagnosis_hasicd9_vte.index   
u_ptunitstayid_diagnosis_hasicd9_vte = diagnosis_hasicd9_vte['patientunitstayid'].unique()
print(str(u_ptunitstayid_diagnosis_hasicd9_vte.shape[0]) + ' unique patientunitstayids with ICD9 and VTE')

# Find patientunitstayids that have DVT in name
diagnosis_nullicd9 = diagnosis[diagnosis['icd9code'].isnull()]
diagnosis_DVT_nullicd9 = diagnosis_nullicd9[diagnosis_nullicd9['diagnosisstring'].str.contains('DVT')]
idx_diagnosis_DVT_nullicd9 = diagnosis_DVT_nullicd9.index
u_ptunitstayid_DVT_nullicd9 = diagnosis_DVT_nullicd9['patientunitstayid'].unique()
print(str(u_ptunitstayid_DVT_nullicd9.shape[0]) + ' unique patientunitstayids with NULL icd9 AND DVT')

# Take union to get patientunitstayids that have VTE
idx_VTE = idx_diagnosis_hasicd9_vte.union(idx_diagnosis_DVT_nullicd9)
ptunitstayid_vte = np.union1d(u_ptunitstayid_diagnosis_hasicd9_vte, u_ptunitstayid_DVT_nullicd9)
print(str(np.count_nonzero(np.unique(ptunitstayid_vte))) + ' u patientunitidstays with VTE diagnosed ever')
ptunitstayid_vte_series = pd.Series(ptunitstayid_vte) # make it a series
ptunitstayid_vte_series.name = 'patientunitstayid'


diagnosis_vte_ever = diagnosis.loc[diagnosis['patientunitstayid'].isin(ptunitstayid_vte)]

415.1
415.10
415.11
415.13
415.19
451.0
451.1
451.10
451.11
451.19
451.2
451.20
451.81
451.89
451.9
451.90
453.1
453.10
453.2
453.20
453.4
453.40
453.41
453.42
453.6
453.60
453.8
453.80
453.89
453.9
453.90
1549 unique patientunitstayids with ICD9 and VTE
626 unique patientunitstayids with NULL icd9 AND DVT
2028 u patientunitidstays with VTE diagnosed ever


In [20]:
if write_filtered03 == 1:
    dir_write_filtered = '/Users/rachellehorwitz/Documents/ViTalErt/data/filtered03/'
    ptunitstayid_vte_series.to_csv(dir_write_filtered + 'pos_ptunitstayid.csv', index=False)
    patient.to_csv(dir_write_filtered + 'patient.csv', index=False)
    diagnosis.to_csv(dir_write_filtered + 'diagnosis.csv', index=False)
    admissiondx.to_csv(dir_write_filtered + 'admissiondx.csv', index=False)

In [21]:
patient['patientunitstayid'].unique().shape[0]

165899

In [22]:
(184117 - 1987)

182130

# Do NOT run code below this line

In [None]:
print(ptunitstayid_vte_series)

In [None]:
ptunitstayid_vte_series.columns

In [None]:
diagnosis_vte_ever = diagnosis.loc[diagnosis['patientunitstayid'].isin(ptunitstayid_vte)]

In [None]:
diagnosis_vte_ever.head(30)

In [None]:
idx_VTE = idx_diagnosis_hasicd9_vte.union(idx_diagnosis_DVT_nullicd9)
idx_VTE
#diagnosis.iloc[idx_VTE]

In [None]:
diagnosis.index.max

In [None]:
idx_diagnosis_hasicd9_vte =  diagnosis_hasicd9_vte.index  
idx_diagnosis_hasicd9_vte

In [None]:
# diagnosis_vte_ever           = diagnosis[diagnosis['patientunitstayid'].isin(ptunitstayid_vte)]
# diagnosis_no_vte             = diagnosis[~diagnosis['patientunitstayid'].isin(ptunitstayid_vte)]
# patient_vte_ever             = patient[patient['patientunitstayid'].isin(ptunitstayid_vte)]
# patient_no_vte_ever          = patient[~patient['patientunitstayid'].isin(ptunitstayid_vte)]
# admissiondx_vte_ever         = diagnosis[diagnosis['patientunitstayid'].isin(ptunitstayid_vte)]
# admissiondx_no_vte_ever      = diagnosis[~diagnosis['patientunitstayid'].isin(ptunitstayid_vte)]

# diagnosis_vte_ever.to_csv(dir_data_write + 'diagnosis_vte_ever.csv', index=False)
# diagnosis_no_vte.to_csv(dir_data_write + 'diagnosis_no_vte.csv', index=False)
# patient_vte_ever.to_csv(dir_data_write + 'patient_vte_ever.csv', index=False)
# patient_no_vte_ever.to_csv(dir_data_write + 'patient_no_vte_ever.csv', index=False)
# admissiondx_vte_ever.to_csv(dir_data_write + 'admissiondx_vte_ever.csv', index=False)
# admissiondx_no_vte_ever.to_csv(dir_data_write + 'admissiondx_no_vte_ever.csv', index=False)

In [None]:
diagnosis_index_vte_ever = diagnosis.loc[diagnosis['patientunitstayid'].isin(ptunitstayid_vte)].index

In [None]:
diagnosis_index_vte_ever

In [None]:
diagnosis_vte_ever.head(10)

In [None]:
# diagnosis_hasicd9_pos = diagnosis_hasicd9_vte[diagnosis_hasicd9_vte['diagnosisoffset'] > time_thresh]
# print(str(diagnosis_hasicd9_pos['patientunitstayid'].unique().shape[0]) + ' unique patientunitstayids that' \
#      + ' meet inclusion/exclusion criteria for age, discharge, admitdx AND have ICD9 AND \n' \
#       + ' VTE diagnosed AFTER ' + str(time_thresh) + ' min of ICU admit')


remove_ptunitstayid_dxtooearly = diagnosis[diagnosis['diagnosisoffset'] <= time_thresh, 'patientunitstayid']
print(str(remove_ptunitstayid_dxtooearly['patientunitstayid'].unique().shape[0]) + ' diagnosed with VTE at time < ' + str(time_thresh))

In [None]:

diagnosis_hasicd9_vte['diagnosisoffset_hrs'] = diagnosis_hasicd9_vte['diagnosisoffset']/60
diagnosis_hasicd9_vte['diagnosisoffset_hrs'].hist(bins=100)
plt.xlabel('Time in ICU (hrs)')
plt.ylabel('Number of unique patientunitstayids')

In [None]:
277/(277+162997)

In [None]:
# # Re-write csv files, elimiating ptunitstayid_dvt_too_early
# def eliminatePtsWithVTETooEarly(csvfiles, ptunitstayid_eliminate, dir_write, dir_read):
#   import pandas as pd
#   import os
#   #csvfiles.remove(dir_data_raw + 'hospital.csv') # no patientunitstayid
#   for csvfile in csvfiles:
#       print('------')
#       print('Working on ' + csvfile)
#       data_in = pd.read_csv(csvfile)
#       data_out = data_in[~data_in['patientunitstayid'].isin(ptunitstayid_dvt_too_early)]
#       base = os.path.basename(csvfile)
#       print(base)
#       newFileNameWithPath = dir_write + base
#       data_out.to_csv(newFileNameWithPath, index=False)
#       print('Saved data to ' + newFileNameWithPath)
#       print('------')

# csvfiles = []
# for file in glob.glob(dir_data_over18 + '*.csv'):
#     csvfiles.append(file)
# dir_write = '/Users/rachellehorwitz/Documents/ViTalErt/data/over18_no_vte_tooearly/'
# eliminatePtsWithVTETooEarly(csvfiles, ptunitstayid_dvt_too_early, dir_write, dir_data_over18)

In [None]:
diagnosis_nullicd9 = diagnosis[diagnosis['icd9code'].isnull()]
diagnosis_DVT_nullicd9 = diagnosis_nullicd9[diagnosis_nullicd9['diagnosisstring'].str.contains('DVT')]
u_ptunitstayid_DVT_nullicd9 = diagnosis_DVT_nullicd9['patientunitstayid'].unique()
ptunitstaid_vte = np.union1d(u_ptunitstayid_diagnosis_hasicd9_vte, u_ptunitstayid_DVT_nullicd9)
ptunitstaid_vte.shape[0]

In [None]:
diagnosis_nullicd9[diagnosis_nullicd9['diagnosisstring'].str.contains('PE')]

In [None]:
diagnosis_nullicd9[(diagnosis_nullicd9['diagnosisstring'].str.contains('Thrombo', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('arter', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('therapy', case=False))& \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('PTCA', case=False))& \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('thrombolytics given within', case=False)) &\
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('suspected', case=False)) &\
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('antithrombotics', case=False))]

In [None]:
diagnosis_nullicd9[(diagnosis_nullicd9['diagnosisstring'].str.contains('DVT'))]

In [None]:
diagnosis_nullicd9[(diagnosis_nullicd9['diagnosisstring'].str.contains('Thrombo', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('arter', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('therapy', case=False))& \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('PTCA', case=False))& \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('thrombolytics given within', case=False)) &\
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('suspected', case=False)) &\
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('antithrombotics', case=False))]

In [None]:
d_ruledout = diagnosis_nullicd9[(diagnosis_nullicd9['diagnosisstring'].str.contains('Embol', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('suspected', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('arterial thromboembolism', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('r/o pulmonary embolism', case=False)) & \
                  (~diagnosis_nullicd9['diagnosisstring'].str.contains('s/p embolectomy', case=False)) ]

In [None]:
d_ruledout['diagnosisstring'].unique()

In [None]:
1302/162997

In [None]:
diagnosis_nullicd9 = diagnosis[diagnosis['icd9code'].isnull()]
print(str(diagnosis_nullicd9['patientunitstayid'].unique().shape[0]) + \
      ' NULL ICD9 u patientunitstayids in diagnosis')

In [None]:
diagnosis.shape[0]

In [None]:
diagnosis_null = diagnosis[diagnosis['icd9code'].isnull()]
print(str(diagnosis_null.shape[0]) + ' diagnosis rows with NULL icd9code')
diagnosis_null.head()

In [None]:
diagnosis = pd.read_csv(dir_data_over18 + 'diagnosis.csv')

# diabetes melitus --> 250.xx
diagnosis.loc[(diagnosis.diagnosisstring.str.contains('diabetes')) & \
              (pd.isnull(diagnosis.icd9code)), 'icd9code']= '250.xx'

# cardiomyopathy --> 425.xx
diagnosis.loc[(diagnosis.diagnosisstring.str.contains('cardiomyopathy')) & \
              (pd.isnull(diagnosis.icd9code)), 'icd9code']= '425.xx'

In [None]:
diagnosis_notnull = diagnosis[~diagnosis['icd9code'].isnull()]
diagnosis_null = diagnosis[diagnosis['icd9code'].isnull()]
print(str(diagnosis_notnull.shape[0]) + ' diagnosis rows with icd9code')
print(str(diagnosis_null.shape[0]) + ' diagnosis rows with NULL icd9code')

In [None]:
diagnosis_null[diagnosis_null['diagnosisstring'].str.contains('pulmonary embolism')]

In [None]:
from icd9codes import *
icd9_vte = getVTEicd9codes()

In [None]:
diagnosis_vte = pd.DataFrame()
for code in icd9_vte:
    print(str(code))
    rows = diagnosis[diagnosis['icd9code'].str.contains(code)]
    diagnosis_vte = pd.concat([rows, diagnosis_vte])
    diagnosis_vte = diagnosis_vte.loc[~diagnosis_vte.index.duplicated(keep='first')]

In [None]:
diagnosis_vte.shape

In [None]:
diagnosis_vte.head(5)

In [None]:
%matplotlib notebook
diagnosis_vte['diagnosisoffset'].hist(bins=500)

In [None]:
import numpy as np
# Don't look at patients who are entering the ICU for PE/DVT, so threshold by time
print('time_thresh is ' + str(time_thresh))
vte_greaterThanTime = diagnosis_vte[diagnosis_vte['diagnosisoffset']>=time_thresh]
vte_lessThanTime = diagnosis_vte[diagnosis_vte['diagnosisoffset']<time_thresh]
print(str(vte_greaterThanTime.shape[0]) + ' visits are included for time_thresh=' + str(time_thresh))
print(str(vte_lessThanTime.shape[0]) + ' visits are eliminated for time_thresh=' + str(time_thresh))

# Look for duplicates in patientunitstayid in vte_greaterThanTime and vte_lessThanTime to see if some people were diagnosed more than once
u_vte_greaterThanTime_patientunitstayid = vte_greaterThanTime['patientunitstayid'].unique()
print(str(u_vte_greaterThanTime_patientunitstayid.shape[0]) + ' unique patientunitstayids in greaterThanTime')
u_vte_lessThanTime_patientunitstayid = vte_lessThanTime['patientunitstayid'].unique()
print(str(u_vte_lessThanTime_patientunitstayid.shape[0]) + ' unique patientunitstayids in lessThanTime')
type(u_vte_lessThanTime_patientunitstayid)
ptunitstayid_intersect = np.intersect1d(u_vte_greaterThanTime_patientunitstayid, u_vte_lessThanTime_patientunitstayid)
print(str(ptunitstayid_intersect.shape[0]) + ' patientunitstayids in both')

# # Eliminate patients who had VTE diagnosed shortly after they were admitted to the ICU
# diagnosis_vte_eliminateLessThan = diagnosis_vte \
#                                 [~diagnosis_vte['patientunitstayid'].isin(u_vte_lessThanTime_patientunitstayid)]
# diagnosis_vte_eliminateLessThan.head(10)

In [None]:
# Eliminate patientunitstayids who were diagnosed with DVT upon admission to ICU
dvt_admissiondx = admissiondx[admissiondx['admitdxname'].str.contains('Thrombosis, vascular', case=False)]
ptunitstayid_dvt_admission = dvt_admissiondx['patientunitstayid'].unique()
print(str(ptunitstayid_dvt_admission.shape[0]) + ' patientunitstayids were diagnosed with "Thrombosis, vascular" upon admission')

# Take union(patientunitstayids diagnosed with DVT upon admission to ICU ||
#.           patientunitstayids diagnosed with DVT shortly after ICU admission (within time_thresh))
ptunitstayid_dvt_too_early = np.union1d(u_vte_lessThanTime_patientunitstayid, ptunitstayid_dvt_admission)
ptunitstayid_dvt_too_early.shape[0]
print(str(ptunitstayid_dvt_too_early.shape[0]) + ' patientunitstayids had "Thrombosis, vascular" upon admission \n and/or diagnosed with VTE within timeThresh')

In [None]:
# Union of ptunitstayids to eliminate
ptunitstayid_toEliminate = [list(ptunitstayid_diedWithinDeathThresh), list(ptunitstayid_dvt_admission), \
         list(ptunitstayid_dvt_too_early), list(ptunitstayid_nandischarge)] # ptunitstayid_intersect is already in ptunitstayid_dvt_tooearly
ptunitstayid_toEliminate = list(set().union(*ptunitstayid_toEliminate))
ptunitstayid_toEliminate = np.array(ptunitstayid_toEliminate)
print(str(ptunitstayid_toEliminate.shape[0]) + ' patientunitstayids to eliminate')
print('hello')

In [None]:
# I may have left some patients in who were admitted with pulmonary embolism. See how many there are
admissiondx_clean1 = admissiondx[~admissiondx['patientunitstayid'].isin(ptunitstayid_toEliminate)]
admissiondx_clean1.head()
admissiondx_clean1['patientunitstayid'].unique().shape[0]
admissiondx_clean1[admissiondx_clean1['admitdxpath'].str.contains('card', case=False)]


After creating a list of patients to eliminate based on the following criteria:
- < 19 years old
- Died within 48 hours of admission to ICU
- Diagnosed with 
- Fourth item

In [None]:
admissiondx_clean1.columns

In [None]:
admissiondx['patientunitstayid'].unique().shape[0]