In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [3]:
# Create a database connection using settings from the config file.
# If the file does not exist, the hard-coded defaults in the `else` branch are used.
config = '../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # NOTE(review): the port is compared with the string '6432', while the
        # hard-coded default above stores the int 6432. On the default path this
        # test is therefore False and the host/port connection below is used.
        # Left unchanged on purpose -- confirm the intended behaviour.
        if conn_info["sqlhost"] == '192.168.60.144' and conn_info["sqlport"] == '6432':
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   host=conn_info["sqlhost"],
                                   port=conn_info["sqlport"],
                                   user=conn_info["sqluser"])
    except psycopg2.OperationalError:
        # Only a failed connection attempt triggers the password prompt; other
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(dbname=conn_info["dbname"],
                               host=conn_info["sqlhost"],
                               port=conn_info["sqlport"],
                               user=conn_info["sqluser"],
                               password=conn_info["sqlpass"])
else:
    # A password was supplied by the config file: connect with it directly.
    # Previously this path made no connection at all and left `con` unbound.
    con = psycopg2.connect(dbname=conn_info["dbname"],
                           host=conn_info["sqlhost"],
                           port=conn_info["sqlport"],
                           user=conn_info["sqluser"],
                           password=conn_info["sqlpass"])

# Statement prepended to queries so unqualified table names resolve in the configured schemas.
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [4]:
from sqlalchemy import create_engine,text

# Build a SQLAlchemy engine and rebind `con` to it; this replaces whatever
# `con` was bound to before this cell ran.
engine_url = 'postgresql://eicu@192.168.60.144:6432/eicu'
con = create_engine(engine_url)

In [5]:
# Absolute, machine-specific directories; both live under the same root.
_docker_root = '/home/mei/nas/docker/'
# presumably the raw eICU-CRD v2.0 files -- TODO confirm
datadir = _docker_root + 'dataset/EICU/eicu-collaborative-research-database-2.0/'
# Target directory for processed output (the name's spelling is kept because other cells use it).
porcesseddir2 = _docker_root + 'processedData_2/'

In [6]:
def round_up(x, base=5):
    """Round ``x`` to the nearest multiple of ``base``, sending exact ties upward.

    Despite its name, this does not always round up: with ``base=5``,
    112 becomes 110 and 114 becomes 115. A value exactly halfway between two
    multiples goes to the higher one (``round_up(5, 10) == 10``). The previous
    implementation used the built-in ``round``, which resolves ties to the
    nearest even quotient, so ``round_up(5, 10)`` gave 0 while
    ``round_up(15, 10)`` gave 20.

    Parameters
    ----------
    x : number
        Value to snap to the grid.
    base : number, default 5
        Grid spacing; must be non-zero.

    Returns
    -------
    ``base`` multiplied by an ``int``, i.e. an ``int`` when ``base`` is an ``int``.
    """
    # Shift by half a step, then floor-divide: round-half-up without the
    # half-to-even behaviour of round().
    return base * int((x + base / 2) // base)

In [9]:
# Build the SQL text: the search_path statement held in `query_schema`, followed by a
# select that joins `pasthistory` (ph) to `patient_2` (p) on patientunitstayid and
# returns one row per past-history entry with the patient columns attached.
# Rows are ordered by stay id, hospital admit offset, then past-history offset.
# NOTE(review): `patient_2` is presumably a pre-filtered cohort table -- confirm.
query = query_schema + """
select p.patientunitstayid, p.hospitaladmitOffset,ph.pasthistoryoffset,p.gender, p.age, 
      p.apacheadmissiondx,ph.pasthistorypath,p.unitadmitsource, p.admissionweight,	
      p.dischargeweight, p.unitdischargeoffset, p.unitdischargelocation,	
      p.unitdischargestatus
from pasthistory ph
join patient_2 p
  on ph.patientunitstayid = p.patientunitstayid
  
order by p.patientunitstayid, p.hospitaladmitOffset,ph.pasthistoryoffset
"""

In [10]:
# Execute the query over the current connection and preview the first rows.
df_past = pd.read_sql_query(sql=query, con=con)
df_past.head(20)

Unnamed: 0,patientunitstayid,hospitaladmitoffset,pasthistoryoffset,gender,age,apacheadmissiondx,pasthistorypath,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
0,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
1,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
2,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
3,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
4,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Past History...,Direct Admit,84.3,85.8,3596,Death,Expired
5,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
6,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
7,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
8,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
9,141168,0,114,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired


In [14]:
# Index the rows by (stay id, hospital admit offset, past-history offset).
index_cols = ['patientunitstayid', 'hospitaladmitoffset', 'pasthistoryoffset']
df = df_past.copy().set_index(index_cols)

# Discard entries whose past-history offset is negative.
negative_offset = df.index.get_level_values('pasthistoryoffset') < 0
df = df.loc[~negative_offset]

# Sort, then snap every offset onto the 5-min grid with round_up.
df = df.sort_index(level=index_cols)
df = df.rename(round_up, level='pasthistoryoffset')
df.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,age,apacheadmissiondx,pasthistorypath,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
patientunitstayid,hospitaladmitoffset,pasthistoryoffset,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Past History...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,115,Female,70,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...,Direct Admit,84.3,85.8,3596,Death,Expired


In [18]:
# Keep only the "Organ Systems" past-history entries and split each path into
# up to four hierarchy levels (first/second/third/fourth).
organ_prefix = 'notes/Progress Notes/Past History/Organ Systems/'

# Literal (non-regex) match; rows with a missing path are treated as non-matching.
df1 = df[df['pasthistorypath'].str.contains('Organ Systems', regex=False, na=False)]
df1 = df1.reset_index()

# Strip the common prefix. It used to be replaced by a single space, which left
# a leading blank in every path and therefore in the 'first' column
# (' Pulmonary' instead of 'Pulmonary').
df1['pasthistorypath'] = df1['pasthistorypath'].str.replace(organ_prefix, '', regex=False)
# '|' is a regex metacharacter, so request literal replacement explicitly.
df1['pasthistorypath'] = df1['pasthistorypath'].str.replace('|', '/', regex=False)

# One column per path level; levels a path does not have become NaN.
split = df1['pasthistorypath'].str.split('/')
df1['first'] = split.str[0]
df1['second'] = split.str[1]
df1['third'] = split.str[2]
df1['fourth'] = split.str[3]

df1.head()

Unnamed: 0,patientunitstayid,hospitaladmitoffset,pasthistoryoffset,gender,age,apacheadmissiondx,pasthistorypath,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus,first,second,third,fourth
0,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Pulmonary/COPD/COPD - no limitations,Direct Admit,84.3,85.8,3596,Death,Expired,Pulmonary,COPD,COPD - no limitations,
1,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Valve disease/AS,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),Valve disease,AS,
2,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Congestive Heart Failure/C...,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),Congestive Heart Failure,CHF - class II,
3,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Hypertension Requiring Tre...,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),Hypertension Requiring Treatment,hypertension requiring treatment,
4,141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/AICD/AICD,Direct Admit,84.3,85.8,3596,Death,Expired,Cardiovascular (R),AICD,AICD,


In [19]:
# Move the four path-level columns so they sit directly after 'pasthistorypath'.
# The trailing slice assumes those four columns are currently the last ones.
columns = list(df1.columns)
index = columns.index('pasthistorypath')
leading_cols = columns[:index + 1]
remaining_cols = columns[index + 1:-4]
new_order = [*leading_cols, 'first', 'second', 'third', 'fourth', *remaining_cols]

# Apply the new column order, then restore the three-level row index.
df1 = (
    df1.reindex(columns=new_order)
       .set_index(['patientunitstayid', 'hospitaladmitoffset', 'pasthistoryoffset'])
)
# df1.sort_index(level = ['patientunitstayid','hospitaladmitoffset', 'pasthistoryoffset'], inplace = True)
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,age,apacheadmissiondx,pasthistorypath,first,second,third,fourth,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
patientunitstayid,hospitaladmitoffset,pasthistoryoffset,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Pulmonary/COPD/COPD - no limitations,Pulmonary,COPD,COPD - no limitations,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Valve disease/AS,Cardiovascular (R),Valve disease,AS,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Congestive Heart Failure/C...,Cardiovascular (R),Congestive Heart Failure,CHF - class II,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/Hypertension Requiring Tre...,Cardiovascular (R),Hypertension Requiring Treatment,hypertension requiring treatment,,Direct Admit,84.3,85.8,3596,Death,Expired
141168,0,60,Female,70,"Rhythm disturbance (atrial, supraventricular)",Cardiovascular (R)/AICD/AICD,Cardiovascular (R),AICD,AICD,,Direct Admit,84.3,85.8,3596,Death,Expired


In [21]:
# df1.to_csv(porcesseddir2 + 'selected_pastHistory.csv',  index=True)

In [22]:
# Report how many distinct stays and how many rows df1 contains.
stay_ids = df1.index.get_level_values('patientunitstayid')
n = len(stay_ids.unique())
r = df1.shape[0]
print("There are {} unique patientunitstayid in the selected patients with the total {} records.".format(n,r))

There are 53612 unique patientunitstayid in the selected patients with the total 312103 records.


## Frequency of past diagnosis (dx) history

## Missing-value (NaN) statistics

In [20]:
# Percentage of missing values per column of df1, largest first.
missing_ratio = (
    df1.isna()
       .sum()
       .div(len(df1.index))
       .to_frame('missing ratio %')
       .mul(100)
       .sort_values('missing ratio %', ascending=False)
)
missing_ratio

Unnamed: 0,missing ratio %
fourth,89.413431
dischargeweight,34.968264
admissionweight,1.186788
gender,0.0
age,0.0
apacheadmissiondx,0.0
pasthistorypath,0.0
first,0.0
second,0.0
third,0.0
