In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [2]:
# Create a database connection using settings from config file
config = '../db/config.ini'

# Connection info: taken from the config file when it exists, otherwise from
# the hard-coded defaults in the else-branch.
conn_info = dict()
if os.path.isfile(config):
    # `config` is rebound from the path string to the parsed ConfigObj.
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Arguments shared by every connection attempt that addresses the server by
# host and port.
host_kwargs = dict(dbname=conn_info["dbname"],
                   host=conn_info["sqlhost"],
                   port=conn_info["sqlport"],
                   user=conn_info["sqluser"])

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # NOTE(review): the port is compared against the *string* '6432', so
        # this branch can only be taken when the port is a string; the
        # hard-coded fallback port above is the int 6432 and therefore always
        # takes the else-branch. Confirm which behaviour is intended before
        # normalising the types.
        if conn_info["sqlhost"] == '192.168.60.144' and conn_info["sqlport"] == '6432':
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(**host_kwargs)
    except psycopg2.Error:
        # Only database errors trigger the password prompt; KeyboardInterrupt
        # and programming errors are no longer swallowed by a bare `except`.
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(password=conn_info["sqlpass"], **host_kwargs)
else:
    # A password was supplied by the config file: use it directly. Previously
    # `con` was left undefined in this case.
    con = psycopg2.connect(password=conn_info["sqlpass"], **host_kwargs)

# Statement prepended to queries so unqualified table names resolve in the
# configured schema(s).
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [3]:
# Create a SQLAlchemy engine; it is what gets passed as `con` to
# pd.read_sql_query further down.
# NOTE: this rebinds `con`, so the psycopg2 connection opened in the previous
# cell is no longer reachable under that name.
# NOTE(review): the connection URL is hard-coded (user `eicu`, no password)
# instead of being built from `conn_info` - confirm this is intended.
# `text` is imported but not used in the cells visible here.
from sqlalchemy import create_engine,text
con= create_engine('postgresql://eicu@192.168.60.144:6432/eicu')

In [4]:
# Absolute, machine-specific paths; adjust them when running elsewhere.
# Directory of the eICU dataset (the path suggests the v2.0 release - TODO confirm).
datadir = '/home/mei/nas/docker/dataset/EICU/eicu-collaborative-research-database-2.0/'
# Output directory used as the prefix for files written by later cells.
# The variable name is misspelled ("porcessed") but is referenced as-is below.
porcesseddir2 = '/home/mei/nas/docker/processedData_2/'

In [5]:
def round_up(x, base=5):
    """Snap ``x`` to the nearest multiple of ``base``.

    Despite the name, the value is not always rounded upwards: the built-in
    ``round`` is used, so ``x`` moves to the closest multiple and exact
    halves follow ``round``'s tie-breaking (e.g. ``round(2.5) == 2``).

    Args:
        x: Number to snap (e.g. an offset in minutes).
        base: Step size of the grid; defaults to 5.

    Returns:
        ``base`` multiplied by the rounded number of steps.
    """
    step_count = round(x / base)
    return base * step_count

In [6]:
# Past-history query: one row per past-history entry, joined to the matching
# row of `patient_2` on patientunitstayid, ordered by stay and offsets.
# `query_schema` (the `set search_path` statement) is prepended so the
# unqualified table names resolve.
# NOTE(review): `patient_2` is presumably a pre-selected patient cohort table -
# confirm against the database.
# Column names come back lower-cased (e.g. `hospitaladmitoffset`), as the
# result preview below shows.
query = query_schema + """
select p.patientunitstayid, p.hospitaladmitOffset,ph.pasthistoryoffset,p.gender, p.age, 
      p.apacheadmissiondx,ph.pasthistorypath,p.unitadmitsource, p.admissionweight,	
      p.dischargeweight, p.unitdischargeoffset, p.unitdischargelocation,	
      p.unitdischargestatus
from pasthistory ph
join patient_2 p
  on ph.patientunitstayid = p.patientunitstayid
  
order by p.patientunitstayid, p.hospitaladmitOffset,ph.pasthistoryoffset
"""

In [7]:
# Execute the past-history query through `con` and preview the first 20 rows.
df_past = pd.read_sql_query(sql=query, con=con)
df_past.head(20)

Unnamed: 0,patientunitstayid,hospitaladmitoffset,pasthistoryoffset,gender,age,apacheadmissiondx,pasthistorypath,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
0,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
1,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
2,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
3,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
4,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Past History...,Direct Admit,84.3,85.8,3596,Death,Expired
5,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
6,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
7,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
8,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
9,141168,0,114,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired


In [8]:
# Work on a copy so the raw query result in `df_past` stays untouched and this
# cell can be re-run safely.
df = df_past.copy()

# Index by stay id, hospital-admit offset and past-history offset.
df = df.set_index(['patientunitstayid', 'hospitaladmitoffset', 'pasthistoryoffset'])

# Discard entries with a negative past-history offset. A boolean mask removes
# exactly the rows the former label-based `drop(df.index[np.where(...)])` did,
# without looking labels up in a non-unique MultiIndex.
negative_offset = df.index.get_level_values('pasthistoryoffset') < 0
df = df[~negative_offset]

df = df.sort_index(level=['patientunitstayid', 'hospitaladmitoffset', 'pasthistoryoffset'])

# Snap every past-history offset onto a 5-unit grid (nearest multiple, see
# `round_up`); this relabels the index level and leaves the rows unchanged.
df = df.rename(round_up, level='pasthistoryoffset')
df.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,age,apacheadmissiondx,pasthistorypath,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
patientunitstayid,hospitaladmitoffset,pasthistoryoffset,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Past History...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,115,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired


In [9]:
# Keep only the past-history entries filed under "Organ Systems".
# `regex=False` makes this a plain substring test and `na=False` treats
# missing paths as non-matching instead of producing an invalid mask.
organ_systems_mask = df['pasthistorypath'].str.contains('Organ Systems', regex=False, na=False)
df1 = df[organ_systems_mask].reset_index()

# Remove the constant path prefix. It is replaced by the empty string: the
# previous ' ' replacement left a leading space in every path and therefore in
# every value of the `first` column (and in the file names built from it).
df1['pasthistorypath'] = df1['pasthistorypath'].str.replace(
    'notes/Progress Notes/Past History/Organ Systems/', '', regex=False)

# Treat '|' as a literal separator and normalise it to '/'. With regex
# matching enabled, '|' would be an alternation operator, so the literal
# interpretation is requested explicitly.
df1['pasthistorypath'] = df1['pasthistorypath'].str.replace('|', '/', regex=False)

# Split the path into its levels; paths with fewer than four levels get NaN in
# the missing positions.
split = df1['pasthistorypath'].str.split('/')
for level_position, level_name in enumerate(['first', 'second', 'third', 'fourth']):
    df1[level_name] = split.str[level_position]

df1.head()

Unnamed: 0,patientunitstayid,hospitaladmitoffset,pasthistoryoffset,gender,age,apacheadmissiondx,pasthistorypath,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus,first,second,third,fourth
0,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Pulmonary/COPD/COPD - no limitations,Direct Admit,84.3,85.8,3596,Death,Expired,Pulmonary,COPD,COPD - no limitations,
1,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Valve disease/AS,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),Valve disease,AS,
2,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Congestive Heart Failure/C...,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),Congestive Heart Failure,CHF - class II,
3,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Hypertension Requiring Tre...,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),Hypertension Requiring Treatment,hypertension requiring treatment,
4,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/AICD/AICD,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),AICD,AICD,


In [10]:
# Move the four parsed path levels so they sit directly after the raw path
# column. This relies on them being the last four columns of `df1`.
columns = list(df1.columns)
index = columns.index('pasthistorypath')
level_columns = ['first', 'second', 'third', 'fourth']
before_levels = columns[:index + 1]
after_levels = columns[index + 1:-4]
new_order = before_levels + level_columns + after_levels

# Apply the new column order, then restore the three-level row index.
df1 = (
    df1
    .reindex(columns=new_order)
    .set_index(['patientunitstayid', 'hospitaladmitoffset', 'pasthistoryoffset'])
)
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,age,apacheadmissiondx,pasthistorypath,first,second,third,fourth,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
patientunitstayid,hospitaladmitoffset,pasthistoryoffset,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Pulmonary/COPD/COPD - no limitations,Pulmonary,COPD,COPD - no limitations,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Valve disease/AS,Cardiovascular (R),Valve disease,AS,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Congestive Heart Failure/C...,Cardiovascular (R),Congestive Heart Failure,CHF - class II,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Hypertension Requiring Tre...,Cardiovascular (R),Hypertension Requiring Treatment,hypertension requiring treatment,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/AICD/AICD,Cardiovascular (R),AICD,AICD,,Direct Admit,84.3,85.8,3596,Death,Expired


In [11]:
# df1.to_csv(porcesseddir2 + 'selected_pastHistory.csv',  index=True)

In [12]:
# Report how many distinct stays and how many rows the selection contains.
stay_ids = df1.index.get_level_values('patientunitstayid')
n = len(stay_ids.unique())
r = len(df1)
print("There are {} unique patientunitstayid in the selected patients with the total {} records.".format(n,r))

There are 53612 unique patientunitstayid in the selected patients with the total 312103 records.


In [13]:
# One row per stay (first occurrence kept), so each stay contributes once to
# the admission-diagnosis frequencies.
df1_reset = df1.reset_index()
df_unique = df1_reset.drop_duplicates(subset=['patientunitstayid'])

# Count each APACHE admission diagnosis once and derive its share of stays.
admission_dx_counts = df_unique['apacheadmissiondx'].value_counts()
freq = pd.DataFrame({
    'noAnnotations': admission_dx_counts,
    '%': (admission_dx_counts / len(df_unique)) * 100,
})
freq

Unnamed: 0_level_0,noAnnotations,%
apacheadmissiondx,Unnamed: 1_level_1,Unnamed: 2_level_1
"Sepsis, pulmonary",3972,7.408789
"CVA, cerebrovascular accident/stroke",2923,5.452138
"Infarction, acute myocardial (MI)",2867,5.347683
"CHF, congestive heart failure",2773,5.172349
"Sepsis, renal/UTI (including bladder)",2552,4.760128
...,...,...
"Bleeding-other GI, surgery for",1,0.001865
"Spinal cord surgery, other",1,0.001865
"GI Abscess/cyst-primary, surgery for",1,0.001865
"Biopsy, brain",1,0.001865


In [14]:
# freq.to_csv(porcesseddir2 + 'selected_pastHistory_apacheaddx_freq.csv',  index=True)

## Missing-value (NaN) statistics

In [15]:
# Share of missing values per column, as a percentage, largest first.
na_fraction = df1.isna().sum() / len(df1.index)
missing_ratio = (
    pd.DataFrame(na_fraction, columns=['missing ratio %']) * 100
).sort_values('missing ratio %', ascending=False)
missing_ratio

Unnamed: 0,missing ratio %
fourth,89.413431
dischargeweight,34.968264
admissionweight,1.186788
gender,0.0
age,0.0
apacheadmissiondx,0.0
pasthistorypath,0.0
first,0.0
second,0.0
third,0.0


## Frequency of past diagnosis history

### first

In [16]:
# Occurrences of each first-level category and their share of all rows in df1.
first_counts = df1['first'].value_counts()
frequency1 = first_counts.to_frame('noAnnotations')
frequency1['%'] = first_counts / len(df1.index) * 100
frequency1

Unnamed: 0_level_0,noAnnotations,%
first,Unnamed: 1_level_1,Unnamed: 2_level_1
Cardiovascular (R),144104,46.171937
Endocrine (R),45774,14.666312
Pulmonary,39744,12.734258
Neurologic,27196,8.71379
Hematology,22345,7.159495
Renal (R),17722,5.678254
Gastrointestinal (R),10621,3.403043
Rheumatic,2666,0.854205
Infectious Disease (R),1931,0.618706


In [17]:
# Occurrences of each second-level category and their share of all rows in df1.
second_counts = df1['second'].value_counts()
frequency2 = second_counts.to_frame('noAnnotations')
frequency2['%'] = second_counts / len(df1.index) * 100
frequency2

Unnamed: 0_level_0,noAnnotations,%
second,Unnamed: 1_level_1,Unnamed: 2_level_1
Hypertension Requiring Treatment,56983,18.257755
Oncology (R),22345,7.159495
COPD,19067,6.109201
Congestive Heart Failure,18473,5.918879
Non-Insulin Dependent Diabetes,17760,5.690429
Insulin Dependent Diabetes,16268,5.212382
Arrhythmias,15676,5.022701
Myocardial Infarction,10655,3.413937
Hypothyroidism,10649,3.412015
Strokes,10374,3.323903


In [18]:
# Occurrences of each third-level category and their share of all rows in df1.
third_counts = df1['third'].value_counts()
frequency3 = third_counts.to_frame('noAnnotations')
frequency3['%'] = third_counts / len(df1.index) * 100
frequency3

Unnamed: 0_level_0,noAnnotations,%
third,Unnamed: 1_level_1,Unnamed: 2_level_1
hypertension requiring treatment,56983,18.257755
Cancer,17532,5.617376
insulin dependent diabetes,16268,5.212382
CHF,14251,4.566121
medication dependent,13678,4.382528
...,...,...
Cushing's syndrome,22,0.007049
renal tubular acidosis,21,0.006729
excellent - strenuous exercise (>10 mets),14,0.004486
PS,14,0.004486


In [19]:
# Occurrences of each fourth-level category. Missing values are not counted,
# but the percentage is still relative to all rows in df1.
fourth_counts = df1['fourth'].value_counts()
frequency4 = fourth_counts.to_frame('noAnnotations')
frequency4['%'] = fourth_counts / len(df1.index) * 100
frequency4

Unnamed: 0_level_0,noAnnotations,%
fourth,Unnamed: 1_level_1,Unnamed: 2_level_1
Cancer-Primary Site,12246,3.923705
Metastases,3389,1.085859
multiple,3035,0.972435
Chemotherapy,2274,0.728606
Hematologic Malignancy,1897,0.607812
p AVR,1286,0.412043
Radiation Therapy within past 6 months,1113,0.356613
s,999,0.320087
o GI bleeding,967,0.309834
V paced,796,0.255044


In [20]:
# Independent copy of the parsed past-history table for the cross-tabulations.
df_past_dx = df1.copy()

# Row count per (admission diagnosis, first-level category) pair, most
# frequent pairs first.
frequency_table_1 = (
    df_past_dx
    .groupby(['apacheadmissiondx', 'first'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
print(frequency_table_1)

                                      apacheadmissiondx  \
318                       CHF, congestive heart failure   
1612                                  Sepsis, pulmonary   
327                CVA, cerebrovascular accident/stroke   
1119                  Infarction, acute myocardial (MI)   
1621              Sepsis, renal/UTI (including bladder)   
...                                                 ...   
81                                     Addisons disease   
199                                        Appendectomy   
83        Adrenal neoplasm (including pheochromocytoma)   
831                                   GI surgery, other   
582   Complications of previous open-heart surgery, ...   

                        first  count  
318        Cardiovascular (R)  11868  
1612       Cardiovascular (R)  10688  
327        Cardiovascular (R)   7596  
1119       Cardiovascular (R)   7308  
1621       Cardiovascular (R)   6442  
...                       ...    ...  
81     Infectious 

In [21]:
# Row count per (admission diagnosis, second-level category) pair, most
# frequent pairs first.
frequency_table_2 = (
    df_past_dx
    .groupby(['apacheadmissiondx', 'second'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
print(frequency_table_2)

                                      apacheadmissiondx  \
5327                                  Sepsis, pulmonary   
1056               CVA, cerebrovascular accident/stroke   
1007                      CHF, congestive heart failure   
998                       CHF, congestive heart failure   
3597                  Infarction, acute myocardial (MI)   
...                                                 ...   
1867  Defibrillator, automatic implantable cardiac; ...   
1869  Defibrillator, automatic implantable cardiac; ...   
1870  Defibrillator, automatic implantable cardiac; ...   
4590                                  Pneumonia, fungal   
1831  Complications of previous open heart surgery (...   

                                second  count  
5327  Hypertension Requiring Treatment   3941  
1056  Hypertension Requiring Treatment   3595  
1007  Hypertension Requiring Treatment   3101  
998           Congestive Heart Failure   2856  
3597  Hypertension Requiring Treatment   2841  
...

In [25]:
from bigtree import Node, list_to_tree, tree_to_dot, dataframe_to_tree, tree_to_pillow

# For each first-level category: count how often every full path occurs, build
# a tree from the '/'-separated paths, print it with the counts attached and
# save a rendered image next to the other processed outputs.
for disease in frequency1.index.get_level_values('first'):
    df_ = df1[df1['first'] == disease]

    # One row per distinct path with its number of occurrences, most frequent
    # first. Built as a single chain instead of a series of in-place updates.
    paths_mult_ = (
        df_.groupby('pasthistorypath', dropna=True)['pasthistorypath']
        .size()
        .to_frame('noAnnotations')
        .reset_index()
        .sort_values('noAnnotations', ascending=False)
    )

    # The first column holds the path, the remaining column becomes a node
    # attribute that `show` prints.
    root_ = dataframe_to_tree(paths_mult_, sep='/')
    root_.show(attr_list=["noAnnotations"])

    # The file name is the category label directly followed by the fixed
    # suffix (there is no separator between them).
    pillow_image_ = tree_to_pillow(root_)
    pillow_image_.save(porcesseddir2 + str(disease) + "selected_pastHistoryTree.jpg")


 Cardiovascular (R)
├── Hypertension Requiring Treatment
│   └── hypertension requiring treatment [noAnnotations=56983]
├── Congestive Heart Failure
│   ├── CHF [noAnnotations=14251]
│   ├── CHF - severity unknown [noAnnotations=2524]
│   ├── CHF - class III [noAnnotations=570]
│   ├── CHF - class IV [noAnnotations=452]
│   ├── CHF - class II [noAnnotations=443]
│   └── CHF - class I [noAnnotations=233]
├── Arrhythmias
│   ├── atrial fibrillation - chronic [noAnnotations=10881]
│   ├── atrial fibrillation - intermittent [noAnnotations=3017]
│   ├── ventricular tachycardia [noAnnotations=602]
│   ├── SVT- other [noAnnotations=464]
│   ├── sick sinus syndrome [noAnnotations=438]
│   ├── ventricular fibrillation [noAnnotations=128]
│   ├── ventricular ectopy [noAnnotations=118]
│   └── MAT [noAnnotations=28]
├── Myocardial Infarction
│   ├── MI - date unknown [noAnnotations=5826]
│   ├── MI - remote [noAnnotations=1931]
│   ├── MI - within 6 months [noAnnotations=1029]
│   ├── MI - within