In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [2]:
# Create a database connection using settings from config file.
# This cell defines three module-level names used by later cells:
#   conn_info    - dict of connection settings
#   con          - an open psycopg2 connection
#   query_schema - a `set search_path` statement to prefix onto queries
config='../db/config.ini'

# connection info: read from the config file when it exists, otherwise fall
# back to the local defaults below
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = 'localhost'
    conn_info["sqlport"] = 5432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))

# Settings shared by every host/port based connection attempt below
tcp_settings = dict(dbname=conn_info["dbname"],
                    host=conn_info["sqlhost"],
                    port=conn_info["sqlport"],
                    user=conn_info["sqluser"])

if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # The fallback port above is the int 5432, so normalise to text before
        # comparing; a plain == '5432' never matches the default.
        # (presumably values read from the config file are already strings - verify)
        if conn_info["sqlhost"] == 'localhost' and str(conn_info["sqlport"]) == '5432':
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(**tcp_settings)
    except psycopg2.Error:
        # Only database errors fall back to a password prompt; unrelated
        # exceptions such as KeyboardInterrupt are allowed to propagate.
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(password=conn_info["sqlpass"], **tcp_settings)
else:
    # A non-empty password was configured: connect with it directly so that
    # `con` is always defined once this cell has run.
    con = psycopg2.connect(password=conn_info["sqlpass"], **tcp_settings)
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [3]:
from sqlalchemy import create_engine

# Rebind `con` to a SQLAlchemy engine; later pd.read_sql_query calls use this object.
# NOTE(review): the URL is hard-coded (user `eicu`, localhost:5432, database `eicu`)
# and does not read the settings collected in conn_info - confirm this is intended.
engine_url = 'postgresql://eicu@localhost:5432/eicu'
con = create_engine(engine_url)

In [4]:
def round_up(x, base=5):
    """Snap ``x`` to a multiple of ``base``.

    NOTE: despite the name, this rounds to the *nearest* multiple (via the
    built-in ``round``), not strictly upwards - e.g. 61 becomes 60 with the
    default base, while 64 becomes 65.

    Parameters
    ----------
    x : number
        Value to snap.
    base : number, default 5
        Step size of the grid.

    Returns
    -------
    number
        ``base`` multiplied by the rounded quotient ``x / base``.
    """
    nearest_step = round(x / base)
    return base * nearest_step

In [5]:
# Load every row of the admissiondrug table, ordered by drugoffset, and report
# how many distinct patientunitstayid values it contains.
admission_drug_sql = """
select *
from admissiondrug
order by drugoffset
"""
query = query_schema + admission_drug_sql

df_addrug = pd.read_sql_query(query, con)
n_drug = df_addrug['patientunitstayid'].nunique()
patient_count_message = "There are {} patients in the admission drug record."
print(patient_count_message.format(n_drug))

There are 40740 patients in the admission drug record.


In [8]:
# Build the working extract: admission-drug rows joined to patient attributes.
# The `selected_patients` CTE keeps only patient rows whose unitadmitsource is
# 'Emergency Department' or 'Direct Admit'; the outer select inner-joins
# admissiondrug to that CTE on patientunitstayid, so drug rows for any other
# admit source are excluded. The result is read into df_addrug_selected.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select  adrug.admissiondrugid ,adrug.drugoffset,adrug.drugname, adrug.drugdosage, adrug.drugunit, adrug.drugadmitfrequency, sp.*
from admissiondrug adrug
join selected_patients sp
  on adrug.patientunitstayid = sp.patientunitstayid
 """
df_addrug_selected = pd.read_sql_query(query, con)

In [9]:
# Preview the first ten rows of the joined extract
df_addrug_selected.head(10)

Unnamed: 0,admissiondrugid,drugoffset,drugname,drugdosage,drugunit,drugadmitfrequency,patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,dischargeweight,unitdischargelocation,unitdischargestatus
0,1589228,64,ELIQUIS ...,0.0,,,242954,Male,84,"Hemorrhage/hematoma, intracranial",Direct Admit,106.3,106.8,Floor,Alive
1,2063596,29,VICTOZA 2-PAK ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
2,2063592,29,METFORMIN HCL ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
3,2063589,29,LIPITOR ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
4,2063595,29,POTASSIUM CHLORIDE ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
5,2063591,29,FLECAINIDE ACETATE ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
6,2063588,29,ASPIRIN ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
7,2063594,29,OMEPRAZOLE ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
8,2063590,29,CITALOPRAM HBR ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
9,2063593,29,NAPROXEN ...,0.0,,,243285,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive


In [10]:
# df_addrug_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_admissionDrug.csv')

In [19]:
# Reload the selected-patient admission-drug extract from a CSV snapshot.
# The location can be overridden with the EICU_ADMISSION_DRUG_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original absolute path is used unchanged.
# NOTE(review): this rebinds df_addrug_selected, replacing any frame with that
# name already in memory.
selected_csv_path = os.environ.get(
    'EICU_ADMISSION_DRUG_CSV',
    '/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_admissionDrug.csv',
)
df_addrug_selected = pd.read_csv(selected_csv_path)

In [20]:

# Report the size of the extract: distinct unit stays and total number of rows
n_selected_patients = df_addrug_selected['patientunitstayid'].nunique()
n_drug_selected = len(df_addrug_selected['drugname'])
summary_template = "There are {} selected patients in the admission drug table with total {} records."
print(summary_template.format(n_selected_patients, n_drug_selected))

There are 23518 selected patients in the admission drug table with total 490189 records.


In [39]:
# Columns carried into the cleaned frame; patientunitstayid and drugoffset
# live in the MultiIndex instead.
kept_columns = ['drugname', 'drugdosage', 'drugunit', 'drugadmitfrequency',
                'gender', 'age', 'apacheadmissiondx', 'unitadmitsource',
                'admissionweight', 'dischargeweight',
                'unitdischargelocation', 'unitdischargestatus']

indexed = df_addrug_selected.set_index(['patientunitstayid', 'drugoffset'])
# Discard rows recorded at a negative drugoffset
not_negative = ~(indexed.index.get_level_values('drugoffset') < 0)
df = (
    indexed[not_negative]
    .sort_index(level=['patientunitstayid', 'drugoffset'])
    .loc[:, kept_columns]
    # Resample every 5 mins: each drugoffset label is passed through round_up
    .rename(index=round_up, level='drugoffset')
)
# Normalise drug names: keep the first space-delimited token, lower-cased
df['drugname'] = df['drugname'].str.split(' ').str[0].str.lower()

In [40]:
# Preview the first ten rows of the cleaned, re-indexed frame
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,drugname,drugdosage,drugunit,drugadmitfrequency,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,dischargeweight,unitdischargelocation,unitdischargestatus
patientunitstayid,drugoffset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
242954,65,eliquis,0.0,,,Male,84,"Hemorrhage/hematoma, intracranial",Direct Admit,106.3,106.8,Floor,Alive
243285,30,victoza,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,metformin,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,lipitor,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,potassium,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,flecainide,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,aspirin,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,omeprazole,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,citalopram,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive
243285,30,naproxen,0.0,,,Male,67,Diabetic ketoacidosis,Emergency Department,96.7,98.8,Floor,Alive


In [41]:
# Frequency table of normalised drug names: raw count per name, and that count
# as a percentage of all rows in df.
drug_counts = df['drugname'].value_counts()
freq_addrug = pd.DataFrame({
    'noAnnotations': drug_counts,
    '%': drug_counts / len(df.index) * 100,
})

freq_addrug

Unnamed: 0_level_0,noAnnotations,%
drugname,Unnamed: 1_level_1,Unnamed: 2_level_1
aspirin,14503,3.181159
lisinopril,9676,2.122381
vitamin,8341,1.829556
lasix,7801,1.711110
albuterol,7763,1.702774
...,...,...
goniosoft,1,0.000219
oregano,1,0.000219
"multivits,stress",1,0.000219
bioflavonoid,1,0.000219
