In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [2]:
# Create a database connection using settings from config file.
# When the config file is missing, the hard-coded defaults below are used.
config='../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))

# Keyword arguments for a connection without host/port, and for one that
# names the configured host and port explicitly.
socket_kwargs = dict(dbname=conn_info["dbname"], user=conn_info["sqluser"])
network_kwargs = dict(socket_kwargs,
                      host=conn_info["sqlhost"],
                      port=conn_info["sqlport"])

if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    passwordless_attempts = []
    # The port is an int in the fallback defaults above but may arrive as text
    # from the config file; compare as text so both sources take the same path.
    if conn_info["sqlhost"] == '192.168.60.144' and str(conn_info["sqlport"]) == '6432':
        passwordless_attempts.append(socket_kwargs)
    # Always keep the explicit host/port attempt so a setup that connected
    # over the network before still does.
    passwordless_attempts.append(network_kwargs)

    con = None
    for attempt_kwargs in passwordless_attempts:
        try:
            con = psycopg2.connect(**attempt_kwargs)
            break
        except psycopg2.Error:
            # Only database errors move on to the next fallback; anything else
            # (KeyboardInterrupt, programming mistakes) is allowed to propagate.
            con = None

    if con is None:
        # Every passwordless attempt failed: ask for a password interactively.
        conn_info["sqlpass"] = getpass.getpass('Password: ')
        con = psycopg2.connect(password=conn_info["sqlpass"], **network_kwargs)
else:
    # A password was configured: connect with it directly. Previously no
    # connection was opened in this case and `con` stayed undefined.
    con = psycopg2.connect(password=conn_info["sqlpass"], **network_kwargs)

# Statement prepended to queries so unqualified table names resolve in the
# configured schema(s).
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [3]:
from sqlalchemy import create_engine, text

# Build a SQLAlchemy engine and bind it to `con`; this rebinds the name that
# the connection cell assigned, so later cells use the engine.
engine_url = 'postgresql://eicu@192.168.60.144:6432/eicu'
con = create_engine(engine_url)

In [4]:
# Absolute path of the eICU dataset directory (presumably the raw v2.0 release
# files, judging by the directory name — TODO confirm).
datadir = '/home/mei/nas/docker/dataset/EICU/eicu-collaborative-research-database-2.0/'
# Directory of the processed CSV files that later cells read back with pd.read_csv.
# NOTE(review): the variable name is misspelt ("porcessed"); it is kept as-is
# because other cells reference it.
porcesseddir2 = '/home/mei/nas/docker/processedData_2/'

In [5]:
def round_up(x, base=5):
    """Snap ``x`` to the nearest multiple of ``base``.

    Despite the name, the value is not always rounded upwards: the quotient
    ``x / base`` is passed to the built-in ``round``, so ``x`` moves to the
    closest multiple (e.g. 99 -> 100 but 101 -> 100 for ``base=5``), and exact
    ties are resolved by ``round`` itself.

    Parameters
    ----------
    x : number
        Value to snap.
    base : number, default 5
        Step size of the grid.

    Returns
    -------
    number
        ``base`` multiplied by the rounded quotient.
    """
    step_count = round(x / base)
    return base * step_count

In [6]:
# SQL sent to the database: the search_path statement followed by a join of
# every admissiondrug row to its stay in patient_2 on patientunitstayid,
# ordered by stay, hospital-admit offset and drug offset.
# NOTE(review): patient_2 is presumably a pre-selected cohort table — confirm.
query = query_schema + """
select p.patientunitstayid, p.hospitaladmitOffset,a.drugoffset,p.gender, p.age, 
      p.apacheadmissiondx,a.drugname,p.unitadmitsource, p.admissionweight,	
      p.dischargeweight, p.unitdischargeoffset, p.unitdischargelocation,	
      p.unitdischargestatus
from admissiondrug a
join patient_2 p
  on a.patientunitstayid = p.patientunitstayid
  
order by p.patientunitstayid, p.hospitaladmitOffset,a.drugoffset
"""

In [7]:
# Run the admission-drug query and load the full result into a DataFrame.
df_drug_p = pd.read_sql_query(sql=query, con=con)

In [8]:
# Preview the first ten joined rows.
df_drug_p.head(10)

Unnamed: 0,patientunitstayid,hospitaladmitoffset,drugoffset,gender,age,apacheadmissiondx,drugname,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
0,252784,0,99,Male,56,Diabetic ketoacidosis,NOVOLOG ...,Emergency Department,75.0,75.7,2952,Floor,Alive
1,252784,0,99,Male,56,Diabetic ketoacidosis,FLAX SEED OIL ...,Emergency Department,75.0,75.7,2952,Floor,Alive
2,252784,0,99,Male,56,Diabetic ketoacidosis,LISINOPRIL ...,Emergency Department,75.0,75.7,2952,Floor,Alive
3,252784,0,99,Male,56,Diabetic ketoacidosis,MULTIVITAMIN ...,Emergency Department,75.0,75.7,2952,Floor,Alive
4,252784,0,99,Male,56,Diabetic ketoacidosis,ASPIRIN ...,Emergency Department,75.0,75.7,2952,Floor,Alive
5,252784,0,99,Male,56,Diabetic ketoacidosis,OMEPRAZOLE ...,Emergency Department,75.0,75.7,2952,Floor,Alive
6,252784,0,99,Male,56,Diabetic ketoacidosis,OMEGA 3 ...,Emergency Department,75.0,75.7,2952,Floor,Alive
7,252784,0,99,Male,56,Diabetic ketoacidosis,LANTUS ...,Emergency Department,75.0,75.7,2952,Floor,Alive
8,252784,0,99,Male,56,Diabetic ketoacidosis,CARVEDILOL ...,Emergency Department,75.0,75.7,2952,Floor,Alive
9,252784,0,99,Male,56,Diabetic ketoacidosis,ATORVASTATIN CALCIUM ...,Emergency Department,75.0,75.7,2952,Floor,Alive


In [9]:
# Index the joined rows by stay, hospital-admit offset and drug offset.
stay_keys = ['patientunitstayid', 'hospitaladmitoffset', 'drugoffset']
df = df_drug_p.copy().set_index(stay_keys)
# Discard every row whose drug offset is negative.
negative_offset = df.index.get_level_values('drugoffset') < 0
df = df[~negative_offset]
df = df.sort_index(level=stay_keys)
# Snap each drug offset to a multiple of 5 using round_up's default base.
df = df.rename(round_up, level='drugoffset')
# Keep only the first space-separated token of the drug name, lower-cased.
df['drugname'] = df['drugname'].str.split(' ').str[0].str.lower()

In [10]:
# Preview the first twenty rows of the indexed, cleaned frame.
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,age,apacheadmissiondx,drugname,unitadmitsource,admissionweight,dischargeweight,unitdischargeoffset,unitdischargelocation,unitdischargestatus
patientunitstayid,hospitaladmitoffset,drugoffset,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
252784,0,100,Male,56,Diabetic ketoacidosis,novolog,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,flax,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,lisinopril,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,multivitamin,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,aspirin,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,omeprazole,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,omega,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,lantus,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,carvedilol,Emergency Department,75.0,75.7,2952,Floor,Alive
252784,0,100,Male,56,Diabetic ketoacidosis,atorvastatin,Emergency Department,75.0,75.7,2952,Floor,Alive


In [11]:
# df.to_csv(porcesseddir2 + 'selected_patient_drugname.csv',  index=True)

In [None]:
# Report the number of distinct stays and the total number of drug records.
stay_ids = df.index.get_level_values('patientunitstayid')
n = stay_ids.unique().size
r = df.shape[0]
print("There are {} unique patientunitstayid in the selected patients with the total {} records.".format(n,r))

There are 15789 unique patientunitstayid in the selected patients with the total 325226 records.


## Frequency of each drug

In [13]:
# Count how often each normalised drug name occurs, and express each count as
# a percentage of all rows in df.
drug_counts = df['drugname'].value_counts()
freq_addrug = pd.DataFrame({
    'noAnnotations': drug_counts,
    '%': drug_counts / len(df.index) * 100,
})
# freq_addrug.to_csv(porcesseddir2 + 'selected_patient_drugname_freq.csv',  index=True)

In [6]:
# Load the drug-frequency table from its CSV in the processed-data directory;
# this rebinds freq_addrug to the file's contents.
freq_csv_path = porcesseddir2 + 'selected_patient_drugname_freq.csv'
freq_addrug = pd.read_csv(freq_csv_path)

In [8]:
# Print the drug-frequency table as plain text.
print(freq_addrug)

        drugname  noAnnotations         %
0        aspirin          10705  3.291557
1     lisinopril           6917  2.126829
2          lasix           5968  1.835032
3        vitamin           5883  1.808896
4     metoprolol           5205  1.600426
...          ...            ...       ...
2518    immulife              1  0.000307
2519    multigen              1  0.000307
2520    solosite              1  0.000307
2521  hemorrhoid              1  0.000307
2522        tart              1  0.000307

[2523 rows x 3 columns]


In [19]:
# Work on a copy so the indexed frame stays untouched.
df_drug = df.copy()
# Count rows per (admission diagnosis, drug name) pair, most frequent first.
frequency_table = (
    df_drug
    .groupby(['apacheadmissiondx', 'drugname'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

print(frequency_table)

                          apacheadmissiondx       drugname  count
30757                     Sepsis, pulmonary        aspirin    808
5705          CHF, congestive heart failure        aspirin    628
31224                     Sepsis, pulmonary          lasix    620
6524   CVA, cerebrovascular accident/stroke        aspirin    616
6050          CHF, congestive heart failure          lasix    605
...                                     ...            ...    ...
13490               Encephalopathy, hepatic  acetaminophen      1
13489  Encephalopathies (excluding hepatic)         zyrtec      1
13486  Encephalopathies (excluding hepatic)    zafirlukast      1
13483  Encephalopathies (excluding hepatic)     wellbutrin      1
17548                Hepatic failure, acute          zyvox      1

[35097 rows x 3 columns]


In [20]:
# frequency_table.to_csv(porcesseddir2 + 'selected_patient_drugname_addx_freq.csv',  index=False)

In [9]:
# Load the diagnosis-by-drug frequency table from its CSV in the processed-data
# directory; this rebinds frequency_table to the file's contents.
addx_freq_csv_path = porcesseddir2 + 'selected_patient_drugname_addx_freq.csv'
frequency_table = pd.read_csv(addx_freq_csv_path)

In [11]:
# Preview the first twenty diagnosis/drug pairs.
frequency_table.head(20)

Unnamed: 0,apacheadmissiondx,drugname,count
0,"Sepsis, pulmonary",aspirin,808
1,"CHF, congestive heart failure",aspirin,628
2,"Sepsis, pulmonary",lasix,620
3,"CVA, cerebrovascular accident/stroke",aspirin,616
4,"CHF, congestive heart failure",lasix,605
5,"Sepsis, pulmonary",albuterol,579
6,Diabetic ketoacidosis,lantus,561
7,Emphysema/bronchitis,albuterol,508
8,"Sepsis, renal/UTI (including bladder)",aspirin,477
9,"Sepsis, renal/UTI (including bladder)",vitamin,454


## Missing-value (NaN) statistics

In [23]:
# Share of missing values per column, as a percentage of all rows in df,
# listed from the most to the least incomplete column.
na_fraction = df.isna().sum() / len(df.index)
missing_ratio = (
    (na_fraction * 100)
    .to_frame('missing ratio %')
    .sort_values('missing ratio %', ascending=False)
)
missing_ratio

Unnamed: 0,missing ratio %
dischargeweight,36.762743
admissionweight,0.432622
gender,0.0
age,0.0
apacheadmissiondx,0.0
drugname,0.0
unitadmitsource,0.0
unitdischargeoffset,0.0
unitdischargelocation,0.0
unitdischargestatus,0.0
