In [3]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [4]:
# Create a database connection using settings from the config file.
#
# Connection parameters are read from ../db/config.ini when that file exists;
# otherwise the hard-coded fallback values below are used. After this cell runs,
# `conn_info`, `con` (a psycopg2 connection) and `query_schema` are defined.
config='../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    # `config` is rebound from the path string to the parsed ConfigObj.
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # NOTE(review): the port is compared against the *string* '6432', while
        # the fallback above stores the int 6432, so the host-less connection is
        # never chosen for the fallback settings (values read from config.ini
        # are presumably strings). Left unchanged on purpose, because omitting
        # host/port for a non-local server would fail -- confirm the intent.
        if (conn_info["sqlhost"] == '192.168.60.144') and (conn_info["sqlport"] == '6432'):
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   host=conn_info["sqlhost"],
                                   port=conn_info["sqlport"],
                                   user=conn_info["sqluser"])
    except psycopg2.Error:
        # Only database errors trigger the interactive fallback; a bare
        # `except:` would also swallow KeyboardInterrupt and coding errors.
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(dbname=conn_info["dbname"],
                               host=conn_info["sqlhost"],
                               port=conn_info["sqlport"],
                               user=conn_info["sqluser"],
                               password=conn_info["sqlpass"])
else:
    # A password was supplied by the config file. Without this branch no
    # connection was opened at all and `con` stayed undefined.
    con = psycopg2.connect(dbname=conn_info["dbname"],
                           host=conn_info["sqlhost"],
                           port=conn_info["sqlport"],
                           user=conn_info["sqluser"],
                           password=conn_info["sqlpass"])

# Statement prefix that points PostgreSQL at the configured schema(s).
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [5]:
from sqlalchemy import (
    create_engine,
    text,
)

# SQLAlchemy engine used by the pandas queries below. This rebinds `con`,
# replacing whatever `con` referred to before this cell ran.
con = create_engine(
    'postgresql://eicu@192.168.60.144:6432/eicu'
)

In [10]:
# Absolute, machine-specific directories (both end with a trailing slash so
# file names can be appended by plain string concatenation).
# datadir: presumably the location of the raw eICU v2.0 files -- TODO confirm.
datadir = '/home/mei/nas/docker/dataset/EICU/eicu-collaborative-research-database-2.0/'
# porcesseddir2: output directory prefix; it is referenced by the (currently
# commented-out) CSV export of the selected patients. The misspelled name is
# kept as-is because other cells refer to it.
porcesseddir2 = '/home/mei/nas/docker/processedData_2/'

## 筛选来自 Direct Admit 和 Emergency Department、且 ICU 住院时长为 1 到 14 天的病人

In [5]:
# SQL that (re)creates the cohort table `patient_2`: ICU stays admitted from
# the Emergency Department or by Direct Admit whose unit discharge offset lies
# between 1440 and 20160 (i.e. 1 to 14 days when the offset is in minutes).
#
# The former `GROUP BY patientunitstayid, hospitaladmitOffset` was removed:
# the select list contains columns that are neither grouped nor aggregated,
# which PostgreSQL only accepts when the grouped columns include the table's
# primary key. DISTINCT already removes duplicate rows, so the result is the
# same wherever the old statement was valid, and the query no longer depends
# on a primary-key constraint being declared on `patient`.
create_table_patient = query_schema + """

DROP TABLE IF EXISTS patient_2 CASCADE;
CREATE TABLE patient_2 AS
SELECT DISTINCT
    patientunitstayid,
    hospitaladmitoffset,
    gender,
    age,
    apacheadmissiondx,
    unitadmitsource,
    admissionweight,
    dischargeweight,
    unitdischargeoffset,
    unitdischargelocation,
    unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit')
    AND unitdischargeoffset >= 1440
    AND unitdischargeoffset <= 20160
ORDER BY patientunitstayid, hospitaladmitoffset;

"""
# df_p = pd.read_sql_query(query, con)

In [7]:
# NOTE(review): the table-creation step is disabled, so this cell only works
# when `patient_2` already exists in the database -- confirm before a fresh run.
# with con.begin() as connection:
#     connection.execute(text(create_table_patient))  # Use text() to wrap the raw SQL

# Read the whole cohort table into a DataFrame.
patient_2 = "SELECT * FROM patient_2;"
df_patient = pd.read_sql_query(sql=text(patient_2), con=con)

In [None]:
# Preview the first 20 rows of the selected cohort.
df_patient.head(20)

Unnamed: 0,patientunitstayid,hospitaladmitoffset,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
0,141168,0,Female,70,"Rhythm disturbance (atrial, supraventricular)",Direct Admit,84.3,85.8,3596,Death,Expired
1,141265,-1,Male,67,"CVA, cerebrovascular accident/stroke",Direct Admit,100.0,91.8,6068,Floor,Alive
2,141266,-18,Male,73,"Sepsis, renal/UTI (including bladder)",Emergency Department,120.4,112.9,1501,Floor,Alive
3,141276,-1,Female,59,"Arrest, respiratory (without cardiac arrest)",Direct Admit,156.6,156.6,1684,Home,Alive
4,141284,-15,Male,63,Anemia,Direct Admit,,88.5,2076,Floor,Alive
5,141288,-171,Female,61,"Sepsis, pulmonary",Emergency Department,,92.2,1631,Floor,Alive
6,141304,-3,Male,70,"Sepsis, pulmonary",Emergency Department,,68.0,6639,Floor,Alive
7,141329,-87,Male,50,"Infarction, acute myocardial (MI)",Direct Admit,79.0,79.3,2572,Floor,Alive
8,141360,-30,Male,48,"CVA, cerebrovascular accident/stroke",Emergency Department,,92.3,3109,Floor,Alive
9,141415,-347,Male,30,"Thrombosis, vascular (deep vein)",Direct Admit,87.1,86.7,4430,Floor,Alive


In [11]:
# df_patient.to_csv(porcesseddir2 + 'selected_patient.csv',  index=True)

In [12]:
# Number of distinct ICU stays in the selected cohort.
n = df_patient.loc[:, 'patientunitstayid'].nunique()
print(
    "There are {} unique patientunitstayid patients from the unitadmitsource: emergency and direct with a stay length between 1 and 14 days.".format(n)
)

There are 68446 unique patientunitstayid patients from the unitadmitsource: emergency and direct with a stay length between 1 and 14 days.


In [13]:
# Work on an independent (deep) copy so `df_patient` stays untouched.
df_p = df_patient.copy(deep=True)


In [14]:

# Count stays per (discharge location, discharge status) combination.
frequency_table = (
    df_p
    .groupby(['unitdischargelocation', 'unitdischargestatus'])
    .size()
    .reset_index(name='count')
)
print(frequency_table)


       unitdischargelocation unitdischargestatus  count
0                                                     4
1                                          Alive     66
2           Acute Care/Floor               Alive   6725
3                      Death             Expired   3452
4                      Floor               Alive  33902
5                       Home               Alive   6909
6                        ICU               Alive    216
7               Nursing Home               Alive    135
8             Operating Room               Alive      4
9                      Other               Alive    530
10            Other External               Alive   1122
11            Other Hospital               Alive   1250
12                 Other ICU               Alive   1045
13          Other ICU (CABG)               Alive      3
14            Other Internal               Alive     98
15            Rehabilitation               Alive    219
16  Skilled Nursing Facility               Alive

- Home：恢复最好。
- Floor、Rehabilitation、Acute Care/Floor：表示病情有所好转。
- Step-Down Unit (SDU)、Skilled Nursing Facility、Other：患者仍需要监护，情况相对中等。
- ICU、Operating Room、Other ICU、Other ICU (CABG)：病情较为严重。
- Death：最严重的情况。

## 统计 nan 值

In [12]:
# Percentage of missing values per column, largest first.
missing_ratio = (
    pd.DataFrame(
        df_p.isna().sum() / len(df_p.index),
        columns=['missing ratio %'],
    )
    * 100
).sort_values('missing ratio %', ascending=False)
missing_ratio

Unnamed: 0,missing ratio %
dischargeweight,38.985478
admissionweight,2.023493
patientunitstayid,0.0
hospitaladmitoffset,0.0
gender,0.0
age,0.0
apacheadmissiondx,0.0
unitadmitsource,0.0
unitdischargeoffset,0.0
unitdischargelocation,0.0


In [18]:
# Absolute and relative (%) frequency of each unit discharge location.
location_counts = df_p['unitdischargelocation'].value_counts()
freq_l = location_counts.to_frame(name='noAnnotations')
freq_l['%'] = location_counts / len(df_p) * 100
freq_l

Unnamed: 0_level_0,noAnnotations,%
unitdischargelocation,Unnamed: 1_level_1,Unnamed: 2_level_1
Floor,33902,49.531017
Step-Down Unit (SDU),7617,11.128481
Home,6909,10.094089
Acute Care/Floor,6725,9.825264
Telemetry,4325,6.31885
Death,3452,5.043392
Other Hospital,1250,1.826257
Other External,1122,1.639248
Other ICU,1045,1.526751
Skilled Nursing Facility,824,1.203869


In [13]:
# Absolute and relative (%) frequency of each unit discharge status.
status_counts = df_p['unitdischargestatus'].value_counts()
freq_s = status_counts.to_frame(name='noAnnotations')
freq_s['%'] = status_counts / len(df_p) * 100
freq_s

Unnamed: 0_level_0,noAnnotations,%
unitdischargestatus,Unnamed: 1_level_1,Unnamed: 2_level_1
Alive,64990,94.950764
Expired,3452,5.043392
,4,0.005844


## Analysis of the relationship between `unitdischargelocation`, `unitdischargeoffset` and `unitdischargestatus`

In [17]:
# Keep only the stay id, the discharge offset and the discharge location.
cols = [
    'patientunitstayid',
    'unitdischargeoffset',
    'unitdischargelocation',
]
df_p1 = df_p[cols]

In [18]:
# Preview the first five rows of the reduced frame.
df_p1.head(5)

Unnamed: 0,patientunitstayid,unitdischargeoffset,unitdischargelocation
0,141168,3596,Death
1,141265,6068,Floor
2,141266,1501,Floor
3,141276,1684,Home
4,141284,2076,Floor
