In [1]:
import numpy as np
import pandas as pd
import psycopg2
import getpass
# for configuring connection 
from configobj import ConfigObj
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *

In [2]:
# Create a database connection using settings from the config file, falling
# back to the hard-coded defaults below when the file does not exist.
config = '../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))

# Arguments shared by every connection attempt that names the server explicitly.
server_kwargs = dict(dbname=conn_info["dbname"],
                     host=conn_info["sqlhost"],
                     port=conn_info["sqlport"],
                     user=conn_info["sqluser"])

if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # NOTE(review): the fallback port above is the int 6432, so this string
        # comparison is never true for the fallback settings; it can only match
        # a port that is the string '6432' — confirm whether connecting without
        # host/port is really intended for this address.
        if conn_info["sqlhost"] == '192.168.60.144' and conn_info["sqlport"] == '6432':
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(**server_kwargs)
    except psycopg2.OperationalError:
        # Only a failed connection attempt should trigger the password prompt;
        # anything else (including Ctrl-C) is allowed to propagate.
        conn_info["sqlpass"] = getpass.getpass('Password: ')
        con = psycopg2.connect(password=conn_info["sqlpass"], **server_kwargs)
else:
    # A password was configured: connect with it so `con` is always bound.
    con = psycopg2.connect(password=conn_info["sqlpass"], **server_kwargs)

# Statement prepended to queries so they resolve names in the configured schemas.
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [3]:
from sqlalchemy import create_engine,text
from sqlalchemy.engine import URL

# Assemble the URL from its components instead of formatting a string, so that
# characters such as '@', ':' or '/' in the user name or password are escaped.
engine_url = URL.create(
    drivername='postgresql',
    username=conn_info["sqluser"],
    password=conn_info["sqlpass"] or None,  # empty string means "no password"
    host=conn_info["sqlhost"],
    port=int(conn_info["sqlport"]),  # the port may be held as an int or a numeric string
    database=conn_info["dbname"],
)

# SQLAlchemy engine used by the remaining cells; rebinds `con`.
con = create_engine(
    engine_url,
    connect_args={'options': '-c search_path=eicu_crd'}
)

In [10]:
# (Re)create the `drug` materialized view: one row per admissiondrug record
# with a positive drugoffset, matched to `diagnoses` on the patient id.
# NOTE(review): if `diagnoses` holds several rows per patient, this inner join
# repeats every drug record once per diagnosis row, which would inflate the
# record count printed below — confirm against the definition of `diagnoses`.
create_table_query = query_schema +"""
drop materialized view if exists drug cascade;
create materialized view drug as
select d.patient, a.drugname
from admissiondrug a
inner join diagnoses d on a.patientunitstayid = d.patient
where a.drugoffset > 0
order by patient;
"""

# Run the DDL inside a transaction that commits when the block exits.
with con.begin() as connection:
    ddl_statement = text(create_table_query)
    connection.execute(ddl_statement)

# Pull the whole view into memory and report its size.
select_query = "SELECT * FROM drug;"
df_drug = pd.read_sql_query(sql=select_query, con=con)
patient_ids = df_drug["patient"]
print("there are {} patiets and {} records".format(patient_ids.nunique(), patient_ids.count()))

there are 23861 patiets and 10521716 records


In [11]:
# Work on an independent copy so the frame loaded from the database stays untouched.
drug = df_drug.copy(deep=True)

In [12]:
# Frequency table of drug names: absolute number of records and share of all records.
drug_counts = drug['drugname'].value_counts()
freq_addrug = drug_counts.to_frame(name='noAnnotations')
freq_addrug['%'] = drug_counts / len(drug.index) * 100
freq_addrug

Unnamed: 0_level_0,noAnnotations,%
drugname,Unnamed: 1_level_1,Unnamed: 2_level_1
ASPIRIN,369649,3.513201
LISINOPRIL,215076,2.044115
LASIX,179966,1.710424
OMEPRAZOLE,162931,1.548521
COLACE,141692,1.346662
...,...,...
PRENATAL RX,1,0.000010
ALDOMET,1,0.000010
HYDROCODONE BT-HOMATROPINE MBR,1,0.000010
CATHETER,1,0.000010


## select top 100 drugs

In [13]:
# Keep only the records whose drug name is among the 100 most frequent names.
top100_drugs = drug['drugname'].value_counts().head(100).index
drug = drug[drug['drugname'].isin(top100_drugs)]
print('==> Top 100 drugs selected.')
print("there are {} patients in the top 100 drugs and {} records.".format(len(drug['patient'].unique()), len(drug)))

# The final table is a per-patient 0/1 indicator (max over the patient's rows),
# so exact duplicate rows cannot change it. Dropping them before one-hot
# encoding avoids materialising a 100-column indicator row for every repeated
# record; the counts printed above are taken before this step on purpose.
drug = drug.drop_duplicates()

one_hot = pd.get_dummies(drug['drugname'])
drug = drug.join(one_hot)
drug = drug.drop(columns=['drugname'])
drug = drug.astype(int)
# Collapse to one row per patient: 1 if the patient has any record of the drug.
drug = drug.groupby('patient').max()

==> Top 100 drugs selected.
there are 22911 patients in the top 100 drugs and 6462250 records.


In [14]:
# Turn the `patient` group index back into an ordinary column.
drug = drug.reset_index()

In [15]:
import re

def clean_column(col):
    """Return `col` as a lower-case name made only of letters, digits and underscores.

    Surrounding whitespace is stripped first; every remaining run of characters
    outside 0-9, a-z and A-Z is then replaced by a single underscore.
    """
    return re.sub(r'[^0-9a-zA-Z]+', '_', col.strip()).lower()

# Rename every column to its cleaned form so the names can be reused in SQL.
sql_columns = list(map(clean_column, drug.columns))
drug.columns = sql_columns

In [16]:
# Remove the previous view and staging table so this cell can be re-run.
with con.begin() as conn:
    conn.execute(text("DROP MATERIALIZED VIEW IF EXISTS drug CASCADE;"))
    conn.execute(text("DROP TABLE IF EXISTS temp_drug CASCADE;"))

# Double-quote every identifier (doubling any embedded quote) so that names
# starting with a digit or matching a reserved word are still valid in the
# SELECT list below.
columns = ','.join('"{}"'.format(col.replace('"', '""')) for col in sql_columns)

# Stage the one-hot frame in the database as the table `temp_drug`.
drug.to_sql("temp_drug", con, if_exists="replace", index=True, method="multi")

# Publish the staged columns as the materialized view `drug`.
create_table_sql = f"""
DROP TABLE IF EXISTS drug;
create materialized view drug as
SELECT
    {columns}
FROM temp_drug;
"""
with con.begin() as connection:
    connection.execute(text(create_table_sql))


In [17]:
# Read the rebuilt `drug` view back into a DataFrame.
df_drug = pd.read_sql_query(sql="SELECT * FROM drug;", con=con)

In [23]:
# Directory for CSV exports (kept with its trailing slash; paths are built by concatenation).
csv = '/home/mei/nas/docker/thesis/data/csv/'
drug_csv_path = csv + 'drug.csv'
df_drug.to_csv(drug_csv_path, index=False)

In [24]:
# Reload the exported table from disk into `drug`.
print('==> Loading data drug.csv...')
drug_csv_path = csv + 'drug.csv'
drug = pd.read_csv(drug_csv_path)

==> Loading data drug.csv...


In [25]:
# Show the frame as the cell output; the notebook renders a truncated view of it.
drug

Unnamed: 0,patient,acetaminophen,advair_diskus,albuterol,albuterol_sulfate,allopurinol,alprazolam,amiodarone_hcl,amlodipine_besilate,aspir_81,...,tramadol_hcl,trazodone_hcl,tylenol,vancomycin,ventolin_hfa,vicodin,vitamin_c,vitamin_d,warfarin_sodium,zofran
0,243285,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,245345,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,247068,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,247069,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,252784,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22906,3247421,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22907,3335525,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22908,3346588,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
22909,3347496,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Keep a single row per patient from `drug`, restricted to patients that also
# appear in the `id` relation (defined outside this section).
query = query_schema + """
select  distinct on (d.patient) d.*
from drug d 
inner join id on id.patient = d.patient
order by d.patient;
"""
df_drug = pd.read_sql_query(sql=query, con=con)
patient_ids = df_drug["patient"]
print("there are {} patiets and {} records".format(patient_ids.nunique(), patient_ids.count()))

there are 3072 patiets and 3072 records


In [None]:
# Directory for HDF5 exports (kept with its trailing slash; paths are built by concatenation).
hdf = '/home/mei/nas/docker/thesis/data/hdf/'
final_drug_path = hdf + 'final_drug.h5'
df_drug.to_hdf(final_drug_path, key='df')

In [9]:
# Load the per-patient drug indicators and the flat features, attach the drug
# columns to every row of `flat`, and store the combined frame.
# NOTE(review): with how='left', rows of `flat` whose patient is missing from
# `drug` get NaN in all drug columns — confirm that downstream code expects this.
drug = pd.read_hdf(hdf+'final_drug.h5', key='df')
flat = pd.read_hdf(hdf+'final_flat.h5', key='df')
flat_drug = flat.merge(drug, on='patient', how='left')
flat_drug.to_hdf(hdf+'final_flat_drug.h5', key='df')