In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [None]:
# Create a database connection using settings from config file.
# This cell defines two notebook globals used by the cells below:
#   con          - an open psycopg2 connection
#   query_schema - a 'set search_path' statement that is prepended to queries
config='../db/config.ini'

# connection info: taken from the config file when it exists, otherwise
# from the local defaults below
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = 'localhost'
    conn_info["sqlport"] = 5432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # The default port above is the int 5432, while a value read from the
        # config file is presumably a string, so compare as strings.
        if conn_info["sqlhost"] == 'localhost' and str(conn_info["sqlport"]) == '5432':
            # default local server: connect without host/port
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   host=conn_info["sqlhost"],
                                   port=conn_info["sqlport"],
                                   user=conn_info["sqluser"])
    except psycopg2.Error:
        # Password-less login failed: ask for a password and retry.
        # Only database errors are caught so Ctrl-C still interrupts the cell.
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(dbname=conn_info["dbname"],
                               host=conn_info["sqlhost"],
                               port=conn_info["sqlport"],
                               user=conn_info["sqluser"],
                               password=conn_info["sqlpass"])
else:
    # A password was configured: use it directly. Previously this case
    # opened no connection at all, leaving `con` undefined.
    con = psycopg2.connect(dbname=conn_info["dbname"],
                           host=conn_info["sqlhost"],
                           port=conn_info["sqlport"],
                           user=conn_info["sqluser"],
                           password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

In [None]:
from sqlalchemy import create_engine
# NOTE(review): this rebinds `con`, replacing the psycopg2 connection built in
# the previous cell with a SQLAlchemy engine whose URL is hard-coded (user
# 'eicu', localhost:5432, database 'eicu') and does not use conn_info.
# Confirm that bypassing the config file is intended.
con= create_engine('postgresql://eicu@localhost:5432/eicu')

In [None]:
def round_up(x, base=5):
    """Snap ``x`` to the nearest multiple of ``base``.

    Despite the name, the value is not always rounded upwards: the quotient
    ``x / base`` goes through the built-in ``round``, so it is rounded to the
    nearest integer (ties go to the even integer) before being scaled back.

    Parameters
    ----------
    x : number
        Value to snap.
    base : number, default 5
        Step size of the grid to snap to.

    Returns
    -------
    number
        ``base`` multiplied by the rounded quotient.
    """
    n_steps = round(x / base)
    return base * n_steps

In [None]:
# Load every column of the `patient` table into df_total.
query = query_schema + """
select  *
from patient
"""
df_total = pd.read_sql_query(query, con)

# Count distinct patientunitstayid values.
# NOTE(review): the message says "patients", while the next cell groups by
# uniquepid to count stays per patient - confirm which identifier is meant.
n_patient_icu = df_total['patientunitstayid'].nunique()
print("There are {} patients in the icu record.".format(n_patient_icu))


In [None]:
# Bucket patients (uniquepid) by how many rows they have in `patient`:
# 1, 2, 3, 4 or 'Greater than 4', and count the patients in each bucket.
query = query_schema + """
with frequency_count as (
select  uniquepid as uniquep, count(*) as frequency
from patient
group by uniquep
)
select 
    CASE 
        when frequency > 4 then 'Greater than 4'
        else cast(frequency as TEXT)
    END as frequency_category,  count(*) as occurrence
from     frequency_count
group by frequency_category
order by occurrence DESC
"""
df_unique = pd.read_sql_query(query, con)

# Express each bucket as a percentage of all distinct uniquepid values,
# then list the buckets from most to least common.
df_unique['data occurrence'] = (
    df_unique['occurrence']
    .div(df_total['uniquepid'].nunique())
    .mul(100.0)
)
df_unique = df_unique.sort_values('data occurrence', ascending=False)
df_unique.head()

In [None]:
# Build the study cohort: distinct unit stays whose unitadmitsource is
# 'Emergency Department' or 'Direct Admit', with demographic, weight and
# discharge columns from `patient`.
query = query_schema + """

SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
 """

df_selected_patients = pd.read_sql_query(query, con)
# Optional export of the cohort (disabled; note the absolute, user-specific path).
# df_selected_patients.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients.csv', index=False)
# print("Data exported successfully to 'selected_patients.csv'.")
df_selected_patients.head()

In [None]:
# Number of distinct unit stays in the cohort frame.
n_selected_patients=df_selected_patients['patientunitstayid'].nunique()
print("There are {} unique patientunitstayid in the selected patients with the selected unitadmitsource: emergency and direct.".format(n_selected_patients))

In [None]:
# Replace the cohort frame with a previously exported CSV copy.
# NOTE(review): absolute, user-specific path, and the to_csv call that would
# write this file is commented out above - on a fresh machine this raises
# FileNotFoundError. Confirm whether this reload is still needed.
df_selected_patients = pd.read_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients.csv')

In [None]:
# Load the whole `admissiondrug` table (ordered by drugoffset) and count how
# many distinct unit stays appear in it.
# NOTE(review): only the distinct count is used in this cell; a
# count(distinct patientunitstayid) query would avoid transferring every row.
query = query_schema + """
select *
from admissiondrug
order by drugoffset
"""

df_addrug = pd.read_sql_query(query, con)
n_drug = df_addrug['patientunitstayid'].nunique()
print("There are {} patients in the admission drug record.".format(n_drug))

In [None]:
# Admission-drug rows for the cohort: the selected_patients CTE repeats the
# cohort filter (Emergency Department / Direct Admit) and is inner-joined to
# `admissiondrug` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select  adrug.admissiondrugid ,adrug.drugname, adrug.drugdosage, adrug.drugunit, adrug.drugadmitfrequency, sp.*
from admissiondrug adrug
join selected_patients sp
  on adrug.patientunitstayid = sp.patientunitstayid
 """

df_addrug_selected = pd.read_sql_query(query, con)

In [None]:
# Preview the first ten admission-drug rows of the cohort.
df_addrug_selected.head(n=10)

In [None]:
# df_addrug_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_admissionDrug.csv')

In [None]:

# Report how many distinct unit stays and how many rows the cohort's
# admission-drug frame holds.
n_drug_selected = df_addrug_selected['drugname'].size
print(
    "There are {} selected patients in the admission drug table with total {} records.".format(
        df_addrug_selected['patientunitstayid'].nunique(),
        n_drug_selected,
    )
)

In [None]:
# Replace the admission-drug frame with a previously exported CSV copy.
# NOTE(review): absolute, user-specific path; the matching to_csv call above
# is commented out, so the file may not exist on a fresh run.
df_addrug_selected = pd.read_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_admissionDrug.csv')

In [None]:
# Key the admission-drug rows by (patientunitstayid, admissiondrugid), sort on
# that key, keep only the analysis columns, and reduce each drug name to its
# first space-separated token in lower case.
df_addrug_selected = (
    df_addrug_selected
    .set_index(['patientunitstayid', 'admissiondrugid'])
    .sort_index(level=['patientunitstayid', 'admissiondrugid'])
    .loc[:, [
        'drugname', 'drugdosage', 'drugunit', 'drugadmitfrequency',
        'gender', 'age', 'apacheadmissiondx', 'unitadmitsource',
        'admissionweight', 'dischargeweight',
        'unitdischargelocation', 'unitdischargestatus',
    ]]
    .assign(drugname=lambda frame: frame['drugname'].str.split(' ').str[0].str.lower())
)


In [None]:
# Preview the re-indexed admission-drug frame.
df_addrug_selected.head(n=10)

In [None]:
# Frequency table of normalised drug names: absolute count per name and its
# share (in percent) of all rows in df_addrug_selected.
freq_addrug = pd.DataFrame()
freq_addrug['noAnnotations'] = df_addrug_selected['drugname'].value_counts() 
freq_addrug['%'] = df_addrug_selected['drugname'].value_counts() / len(df_addrug_selected.index) * 100

freq_addrug

In [None]:
# Load the whole `admissiondx` table and count the distinct unit stays in it.
query = query_schema + """
select *
from admissiondx
"""

df_addx = pd.read_sql_query(query, con)
n_dx = df_addx['patientunitstayid'].nunique()
print("There are {} icu patients in the admission diagnosis record.".format(n_dx))

In [None]:
# Admission-diagnosis rows for the cohort: same selected_patients CTE as the
# other cohort queries, inner-joined to `admissiondx` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select addx.admissiondxid, addx.admitdxpath	,addx.admitdxname, addx.admitdxtext,sp.*
from admissiondx addx
join selected_patients sp
  on addx.patientunitstayid = sp.patientunitstayid
 """

df_addx_selected = pd.read_sql_query(query, con)

In [None]:
# Preview the cohort's admission-diagnosis rows.
df_addx_selected.head()

In [None]:
# df_addx_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_admissionDx.csv')

In [None]:
# Report how many distinct unit stays and how many rows the cohort's
# admission-diagnosis frame holds. The message previously named the
# admission drug table, a copy-paste slip from the earlier summary cell.
n_dx_selected = len(df_addx_selected['admissiondxid'])
print("There are {} selected patients in the admission diagnosis table with total {} records.".format(df_addx_selected['patientunitstayid'].nunique(), n_dx_selected))

In [None]:
# Replace the admission-diagnosis frame with a previously exported CSV copy.
# NOTE(review): absolute, user-specific path; the matching to_csv call above
# is commented out, so the file may not exist on a fresh run.
df_addx_selected = pd.read_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_admissionDx.csv')

In [None]:
#  data to be grouped by patients (patientunitstayid) and their specific diagnosis IDs (admissiondxid),
# Key the admission-diagnosis rows by (patientunitstayid, admissiondxid),
# sort on that key and keep only the analysis columns.
df_addx_selected = df_addx_selected.set_index(['patientunitstayid', 'admissiondxid'])
df_addx_selected = df_addx_selected.sort_index(level=['patientunitstayid', 'admissiondxid'])
df_addx_selected = df_addx_selected[[
    'admitdxpath', 'admitdxname', 'admitdxtext',
    'gender', 'age', 'apacheadmissiondx', 'unitadmitsource',
    'admissionweight', 'dischargeweight',
    'unitdischargelocation', 'unitdischargestatus',
]]
# Use '/' as the path separator so the tree builder further down can split
# the diagnosis paths. '|' is a regex metacharacter, so the replacement is
# made explicitly literal instead of relying on the pandas-version-dependent
# default of `regex`.
df_addx_selected['admitdxpath'] = df_addx_selected['admitdxpath'].str.replace('|', '/', regex=False)

In [None]:
# Preview the re-indexed admission-diagnosis frame.
df_addx_selected.head(n=10)

In [None]:
from bigtree import Node, list_to_tree, tree_to_dot, dataframe_to_tree, tree_to_pillow

# Count how many rows share each admission-diagnosis path (missing paths are
# dropped), most frequent first.
paths_mult_ = df_addx_selected.groupby('admitdxpath', dropna = True)['admitdxpath'].size().to_frame('noAnnotations')
paths_mult_.reset_index(inplace=True)
paths_mult_.sort_values('noAnnotations', ascending = False, inplace = True)

# NOTE(review): paths_ is not used by any cell visible here.
paths_ = list(pd.unique(df_addx_selected['admitdxpath']))
# Build a tree from the '/'-separated paths and print it with the counts.
root_ = dataframe_to_tree(paths_mult_, sep = '/')
root_.show(attr_list=["noAnnotations"])

In [None]:
# Render the diagnosis tree to an image and open it.
img = tree_to_pillow(root_)
img.show()  

In [None]:
# Convert the diagnosis tree to its dot representation and print it.
# NOTE(review): confirm that the object returned by tree_to_dot actually
# provides a .show() method; otherwise this line raises AttributeError.
dot_representation = tree_to_dot(root_)
dot_representation.show()
print(dot_representation) 

In [None]:
# Pie chart of annotation counts per diagnosis.
# NOTE(review): `df` (with 'Annotations' and 'Diagnosis' columns) is not
# defined in any cell visible above, so this cell raises NameError on a
# fresh kernel. Confirm which frame was intended and define it explicitly.
plt.figure(figsize=(8, 8))
plt.pie(df['Annotations'], labels=df['Diagnosis'], autopct='%1.1f%%', startangle=140, colors=sns.color_palette('Blues', len(df)))
plt.title('Distribution of Top-Level Diagnoses')
plt.show()

In [None]:
# Load the whole `diagnosis` table and count the distinct unit stays in it.
# NOTE(review): this reuses the names df_addx / n_dx that the admissiondx
# cell assigned earlier, so the admission-diagnosis frame is overwritten.
query = query_schema + """
select *
from diagnosis

"""

df_addx = pd.read_sql_query(query, con)
n_dx = df_addx['patientunitstayid'].nunique()
print("There are {} icu patients in the diagnosis record.".format(n_dx))

In [None]:
# Diagnosis rows for the cohort: same selected_patients CTE as the other
# cohort queries, inner-joined to `diagnosis` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select dx.diagnosisid	, dx.activeupondischarge	,dx.diagnosisstring, dx.diagnosispriority,sp.*
from diagnosis dx
join selected_patients sp
  on dx.patientunitstayid = sp.patientunitstayid
 """

df_diagnosis_selected = pd.read_sql_query(query, con)
df_diagnosis_selected.head()

In [None]:
# df_diagnosis_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_diagnosis.csv')

In [None]:
# Report how many distinct unit stays and how many rows the cohort's
# diagnosis frame holds.
n_diagnosis_selected = df_diagnosis_selected['diagnosisid'].size
print(
    "There are {} selected patients in the diagnosis table with total {} records.".format(
        df_diagnosis_selected['patientunitstayid'].nunique(),
        n_diagnosis_selected,
    )
)

In [None]:
# Load the whole `lab` table and count the distinct unit stays in it.
# NOTE(review): only the distinct count is used in this cell; a
# count(distinct patientunitstayid) query would avoid transferring every row.
query = query_schema + """
select *
from lab
"""

df_lab = pd.read_sql_query(query, con)
n_lab = df_lab['patientunitstayid'].nunique()
print("There are {} icu patients in the lab  record.".format(n_lab))

In [None]:
# Lab rows for the cohort: same selected_patients CTE as the other cohort
# queries, inner-joined to `lab` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select l.labid, l.labtypeid	,l.labname, l.labresult,l.labmeasurenamesystem, sp.*
from lab l
join selected_patients sp
  on l.patientunitstayid = sp.patientunitstayid
 """

df_lab_selected = pd.read_sql_query(query, con)
df_lab_selected.head()

In [None]:
# df_lab_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_lab.csv')

In [None]:
# Report how many distinct unit stays and how many rows the cohort's lab
# frame holds.
n_lab_selected = df_lab_selected['labid'].size
print(
    "There are {} selected patients in the lab table with total {} records.".format(
        df_lab_selected['patientunitstayid'].nunique(),
        n_lab_selected,
    )
)

In [None]:
# Load the whole `medication` table and count the distinct unit stays in it.
query = query_schema + """
select *
from medication
"""

df_med = pd.read_sql_query(query, con)
# Count from the medication frame itself; the count was previously taken
# from df_lab, which reported the lab table's figure here.
n_med = df_med['patientunitstayid'].nunique()
print("There are {} icu patients in the medication record.".format(n_med))

In [None]:
# Medication rows for the cohort: same selected_patients CTE as the other
# cohort queries, inner-joined to `medication` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select med.medicationid,med.drugname, med.dosage, med.frequency, med.prn, med.drugstopoffset,sp.*
from medication med
join selected_patients sp
  on med.patientunitstayid = sp.patientunitstayid
 """

df_med_selected = pd.read_sql_query(query, con)
df_med_selected.head()

In [None]:
# df_med_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_med.csv')

In [None]:
# Report how many distinct unit stays and how many rows the cohort's
# medication frame holds.
n_med_selected = df_med_selected['medicationid'].size
print(
    "There are {} selected patients in the medication table with total {} records.".format(
        df_med_selected['patientunitstayid'].nunique(),
        n_med_selected,
    )
)

In [None]:
# Load the whole `nursecharting` table (ordered by nursingchartoffset) and
# count the distinct unit stays in it.
# NOTE(review): only the distinct count is used in this cell; a
# count(distinct patientunitstayid) query would avoid sorting and
# transferring every row.
query = query_schema + """
select *
from nursecharting
order by nursingchartoffset
"""

df_nursecharting = pd.read_sql_query(query, con)
n_nursecharting = df_nursecharting['patientunitstayid'].nunique()
print("There are {} icu patients in the nursecharting record.".format(n_nursecharting))

In [None]:
# Nurse-charting rows for the cohort: same selected_patients CTE as the other
# cohort queries, inner-joined to `nursecharting` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select nc.nursingchartid, nc.nursingchartcelltypecat, nc.nursingchartcelltypevallabel, nc.nursingchartcelltypevalname, nc.nursingchartvalue, sp.*
from nursecharting nc
join selected_patients sp
  on nc.patientunitstayid = sp.patientunitstayid
 """

df_nchart_selected = pd.read_sql_query(query, con)
df_nchart_selected.head()

In [None]:
# Load the whole `pasthistory` table (ordered by pasthistoryenteredoffset)
# and count the distinct unit stays in it.
query = query_schema + """
select *
from pasthistory
order by pasthistoryenteredoffset
"""

df_past = pd.read_sql_query(query, con)
n_past = df_past['patientunitstayid'].nunique()
# Message spelling corrected ("pasthistary" -> "pasthistory") to match the table name.
print("There are {} icu patients in the pasthistory record.".format(n_past))

In [None]:
# Past-history rows for the cohort: same selected_patients CTE as the other
# cohort queries, inner-joined to `pasthistory` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select ph.pasthistoryid, ph.pasthistorynotetype,ph.pasthistorypath,ph.pasthistoryvalue,	ph.pasthistoryvaluetext,sp.*
from pasthistory ph
join selected_patients sp
  on ph.patientunitstayid = sp.patientunitstayid
 """

df_past_selected = pd.read_sql_query(query, con)
df_past_selected.head()

In [None]:
# df_past_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_pasthistory.csv')

In [None]:
# Report how many distinct unit stays and how many rows the cohort's
# past-history frame holds.
n_past_selected = df_past_selected['pasthistoryid'].size
print(
    "There are {} selected patients in the past history table with total {} records.".format(
        df_past_selected['patientunitstayid'].nunique(),
        n_past_selected,
    )
)

In [None]:
# Load the whole `treatment` table (ordered by treatmentoffset) and count the
# distinct unit stays in it.
query = query_schema + """
select *
from treatment
order by treatmentoffset	
"""

df_treatment = pd.read_sql_query(query, con)
n_tr = df_treatment['patientunitstayid'].nunique()
print("There are {} icu patients in the treatment record.".format(n_tr))

In [None]:
# Treatment rows for the cohort: same selected_patients CTE as the other
# cohort queries, inner-joined to `treatment` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select tr.treatmentid, tr.treatmentstring,	tr.activeupondischarge,sp.*
from treatment tr
join selected_patients sp
  on tr.patientunitstayid = sp.patientunitstayid
 """

df_treatment_selected = pd.read_sql_query(query, con)
df_treatment_selected.head()

In [None]:
# df_treatment_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_treatment.csv')

In [None]:
# Report how many distinct unit stays and how many rows the cohort's
# treatment frame holds. The patient count is taken from
# df_treatment_selected; it was previously read from df_past_selected, which
# reported the past-history figure under the treatment label.
n_treatment_selected = len(df_treatment_selected['treatmentid'])
print("There are {} selected patients in the treatment table with total {} records.".format(df_treatment_selected['patientunitstayid'].nunique(), n_treatment_selected))

In [None]:
# Load the whole `vitalAperiodic` table (ordered by observationoffset) and
# count the distinct unit stays in it.
query = query_schema + """
select *
from vitalAperiodic
order by observationoffset
"""

df_aperiodic = pd.read_sql_query(query, con)
# Count from the frame just loaded; the count was previously taken from
# df_treatment, which repeated the treatment table's figure here.
n_vap = df_aperiodic['patientunitstayid'].nunique()
print("There are {} icu patients in the vitalAperiodic record.".format(n_vap))

In [None]:
# Aperiodic vital-sign rows for the cohort: same selected_patients CTE as the
# other cohort queries, inner-joined to `vitalaperiodic` on patientunitstayid.
query = query_schema + """

with selected_patients as (
SELECT DISTINCT patientunitstayid,gender,age,apacheadmissiondx,unitadmitsource,admissionweight,	dischargeweight,unitdischargelocation,	unitdischargestatus
FROM patient
WHERE unitadmitsource IN ('Emergency Department', 'Direct Admit') 
)
select vap.vitalaperiodicid,vap.observationoffset,vap.noninvasivesystolic,	vap.noninvasivediastolic,	vap.noninvasivemean,	vap.paop,	vap.cardiacoutput,	vap.cardiacinput,	vap.svr,	vap.svri,	vap.pvr,	vap.pvri,sp.*
from vitalaperiodic vap
join selected_patients sp
  on vap.patientunitstayid = sp.patientunitstayid
 """

df_aperiodic_selected = pd.read_sql_query(query, con)
df_aperiodic_selected.head()

In [None]:
# df_aperiodic_selected.to_csv('/Users/meisun/Documents/study/thesis/master project/data-preprocessing/selected_patients_aperiodic.csv')

In [None]:
# Report how many distinct unit stays and how many rows the cohort's
# aperiodic vital-sign frame holds. The patient count is taken from
# df_aperiodic_selected (it was previously read from df_past_selected), and
# the "aoeriodic" typo in the message is corrected.
n_aperiodic_selected = len(df_aperiodic_selected['vitalaperiodicid'])
print("There are {} selected patients in the vital aperiodic table with total {} records.".format(df_aperiodic_selected['patientunitstayid'].nunique(), n_aperiodic_selected))

In [None]:
# Load the whole `vitalperiodic` table (ordered by observationoffset) and
# count the distinct unit stays in it.
query = query_schema + """
select *
from vitalperiodic
order by observationoffset
"""

# Store the result under its own name so the vitalAperiodic frame
# (df_aperiodic) is not overwritten, and count from this frame; the count was
# previously taken from df_treatment and stored in n_tr.
df_periodic = pd.read_sql_query(query, con)
n_vp = df_periodic['patientunitstayid'].nunique()
print("There are {} icu patients in the vital periodic record.".format(n_vp))