In [2]:
import numpy as np
import pandas as pd
import psycopg2
import getpass
# for configuring connection 
from configobj import ConfigObj
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *

In [3]:
# Create a database connection using settings from the config file when it
# exists, otherwise from the hard-coded fallback values below.
config = '../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    # `config` is rebound from the path string to the parsed ConfigObj.
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
# NOTE(review): when a non-empty password is configured, no psycopg2 connection
# is opened by this cell at all (there is no `else` branch) - confirm intent.
if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # NOTE(review): the port is compared with the string '6432' while the
        # fallback default above is the integer 6432, so with the fallback
        # values this branch is not taken and host/port are passed explicitly.
        # Left unchanged because altering it would change which server is
        # contacted - confirm the intended behaviour.
        if conn_info["sqlhost"] == '192.168.60.144' and conn_info["sqlport"] == '6432':
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   host=conn_info["sqlhost"],
                                   port=conn_info["sqlport"],
                                   user=conn_info["sqluser"])
    except psycopg2.OperationalError:
        # Only a failed connection attempt should lead to the password prompt;
        # a bare `except:` would also swallow KeyboardInterrupt and real bugs.
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(dbname=conn_info["dbname"],
                               host=conn_info["sqlhost"],
                               port=conn_info["sqlport"],
                               user=conn_info["sqluser"],
                               password=conn_info["sqlpass"])
# Statement prepended to later SQL so unqualified table names resolve.
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [4]:
from sqlalchemy import create_engine,text
from sqlalchemy.engine import URL

# Build the engine URL from its components instead of interpolating them into
# a string: URL.create escapes characters such as '@', ':' or '/' in the user
# name or password, which would otherwise corrupt the connection URL.
# NOTE: this rebinds `con`, replacing whatever an earlier cell assigned to it.
con = create_engine(
    URL.create(
        drivername='postgresql',
        username=conn_info["sqluser"],
        password=conn_info["sqlpass"],
        host=conn_info["sqlhost"],
        # the port may arrive as a string (config file) or an int (fallback)
        port=int(conn_info["sqlport"]),
        database=conn_info["dbname"],
    ),
    connect_args={'options': '-c search_path=eicu_crd'}
)

In [5]:
# (Re)create the `diagnoses` materialized view: one row per past-history entry
# whose path contains 'Organ Systems', joined to the `labels` table on the
# patient identifier, then load the whole view into a DataFrame.
create_table_query = f"""{query_schema}
drop materialized view if exists diagnoses cascade;
create materialized view diagnoses as
  -- for past medical history:
select l.patient,l.apacheadmissiondx, ph.pasthistorypath as diagnosisstring
from pasthistory as ph
inner join labels as l on l.patient = ph.patientunitstayid
where ph.pasthistoryoffset > 0
      and ph.pasthistorypath LIKE '%Organ Systems%'      
"""
# `con.begin()` wraps the DDL in a transaction that is committed on exit.
with con.begin() as connection:
    connection.execute(text(create_table_query))

select_query = "SELECT * FROM diagnoses;"
df_dx = pd.read_sql_query(sql=select_query, con=con)
# Report distinct patients and non-null record count.
print(
    "there are {} patiets and {} records".format(
        df_dx["patient"].nunique(),
        df_dx["patient"].count(),
    )
)

there are 93272 patiets and 482385 records


## Process the diagnoses

In [6]:
# Unique (patient, admission diagnosis) pairs, kept separately from the
# working copy of the full diagnoses table.
flat = df_dx.loc[:, ['patient', 'apacheadmissiondx']].drop_duplicates()
# Work on an independent copy so `df_dx` stays as loaded from the database.
diagnoses = df_dx.copy(deep=True)

In [7]:
# Display the working copy of the diagnoses table (patient, admission
# diagnosis, past-history path) for a visual sanity check.
diagnoses 

Unnamed: 0,patient,apacheadmissiondx,diagnosisstring
0,141168,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...
1,141168,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...
2,141168,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...
3,141168,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...
4,141168,"Rhythm disturbance (atrial, supraventricular)",notes/Progress Notes/Past History/Organ System...
...,...,...,...
482380,3353226,"Effusions, pleural",notes/Progress Notes/Past History/Organ System...
482381,3353226,"Effusions, pleural",notes/Progress Notes/Past History/Organ System...
482382,3353237,"Embolus, pulmonary",notes/Progress Notes/Past History/Organ System...
482383,3353254,"Bleeding, lower GI",notes/Progress Notes/Past History/Organ System...


In [8]:
# Fetch the distinct patient identifiers from the `id` table, in ascending order.
# NOTE(review): the result is bound to `id`, which shadows the Python builtin
# of the same name for the rest of the notebook; the name is kept because the
# following cell reads `id['patient']`.
query = f"""{query_schema}
select distinct(patient)
from id
order by patient;
"""
id = pd.read_sql_query(sql=query, con=con)

In [9]:
# Keep only the diagnoses of patients that also appear in the `id` table.
common_patients = set(diagnoses['patient']).intersection(id['patient'])
diagnoses = diagnoses[diagnoses['patient'].isin(common_patients)]
# Report distinct patients and non-null record count after filtering.
print(
    "there are {} patiets and {} records".format(
        diagnoses["patient"].nunique(),
        diagnoses["patient"].count(),
    )
)

there are 3072 patiets and 24416 records


In [10]:
# Remove the admission-diagnosis column; only the past-history path is needed
# from here on. errors='ignore' keeps the cell re-runnable: without it a second
# execution raises KeyError because the column is already gone.
diagnoses = diagnoses.drop(columns=['apacheadmissiondx'], errors='ignore')
diagnoses

Unnamed: 0,patient,diagnosisstring
23819,260132,notes/Progress Notes/Past History/Organ System...
24064,261021,notes/Progress Notes/Past History/Organ System...
24065,261021,notes/Progress Notes/Past History/Organ System...
24066,261021,notes/Progress Notes/Past History/Organ System...
24067,261021,notes/Progress Notes/Past History/Organ System...
...,...,...
471271,3247116,notes/Progress Notes/Past History/Organ System...
471272,3247116,notes/Progress Notes/Past History/Organ System...
471273,3247116,notes/Progress Notes/Past History/Organ System...
471274,3247116,notes/Progress Notes/Past History/Organ System...


In [11]:
# Split each past-history path into its hierarchy levels.
df1 = diagnoses.copy()
# Both replacements are literal: regex=False is passed explicitly so the result
# does not depend on the pandas version's default for `regex` ('|' is a regex
# metacharacter).
# NOTE(review): the common prefix is replaced by a single space, so every path
# (and therefore the `first` column) starts with a blank; a later cell strips
# `first` but not `diagnosisstring`. Kept as is so saved data stays unchanged.
df1['diagnosisstring'] = (
    df1['diagnosisstring']
    .str.replace('notes/Progress Notes/Past History/Organ Systems/', ' ', regex=False)
    .str.replace('|', '/', regex=False)
)
# One list of path components per row; levels missing from a path become NaN.
split = df1['diagnosisstring'].str.split('/')
df1['first'] = split.str[0]
df1['second'] = split.str[1]
df1['third'] = split.str[2]
df1['fourth'] = split.str[3]
df1

Unnamed: 0,patient,diagnosisstring,first,second,third,fourth
23819,260132,Endocrine (R)/Hypothyroidism/hypothyroidism,Endocrine (R),Hypothyroidism,hypothyroidism,
24064,261021,Cardiovascular (R)/Pacemaker/unknown pacer,Cardiovascular (R),Pacemaker,unknown pacer,
24065,261021,Pulmonary/Asthma/asthma,Pulmonary,Asthma,asthma,
24066,261021,Pulmonary/COPD/COPD - moderate,Pulmonary,COPD,COPD - moderate,
24067,261021,Pulmonary/Asthma/asthma,Pulmonary,Asthma,asthma,
...,...,...,...,...,...,...
471271,3247116,Pulmonary/Respiratory Failure/respiratory fai...,Pulmonary,Respiratory Failure,respiratory failure - within 5 years,
471272,3247116,Cardiovascular (R)/Hypertension Requiring Tre...,Cardiovascular (R),Hypertension Requiring Treatment,hypertension requiring treatment,
471273,3247116,Pulmonary/Home Oxygen/home oxygen,Pulmonary,Home Oxygen,home oxygen,
471274,3247116,Pulmonary/COPD/COPD - severe,Pulmonary,COPD,COPD - severe,


In [12]:
# Frequency of each top-level category: absolute number of annotations and
# share of all rows in df1, in percent.
frequency1 = df1['first'].value_counts().to_frame('noAnnotations')
frequency1['%'] = frequency1['noAnnotations'] / len(df1.index) * 100
frequency1

Unnamed: 0_level_0,noAnnotations,%
first,Unnamed: 1_level_1,Unnamed: 2_level_1
Cardiovascular (R),12341,50.544725
Endocrine (R),3258,13.343709
Pulmonary,2310,9.461009
Neurologic,1807,7.400885
Hematology,1803,7.384502
Renal (R),1350,5.529161
Gastrointestinal (R),1232,5.045872
Rheumatic,165,0.675786
Infectious Disease (R),150,0.614351


In [13]:
# Frequency of each second-level category: absolute number of annotations and
# share of all rows in df1, in percent.
frequency2 = df1['second'].value_counts().to_frame('noAnnotations')
frequency2['%'] = frequency2['noAnnotations'] / len(df1.index) * 100
frequency2

Unnamed: 0_level_0,noAnnotations,%
second,Unnamed: 1_level_1,Unnamed: 2_level_1
Hypertension Requiring Treatment,4771,19.540465
Oncology (R),1803,7.384502
Congestive Heart Failure,1341,5.4923
Non-Insulin Dependent Diabetes,1325,5.426769
Arrhythmias,1241,5.082733
COPD,1152,4.718218
Insulin Dependent Diabetes,1010,4.136632
Cirrhosis,920,3.768021
Valve disease,908,3.718873
Myocardial Infarction,894,3.661533


In [14]:
from bigtree import Node, list_to_tree, tree_to_dot, dataframe_to_tree, tree_to_pillow

# Print one annotated tree per top-level category, most frequent category first.
# The categories are derived from df1 itself rather than from `frequency1`:
# a later cell strips whitespace from df1['first'], after which the labels
# stored in `frequency1.index` would no longer match and re-running this cell
# would select no rows.
for disease in df1['first'].value_counts().index:
    # set_index returns a new frame, so the filtered slice of df1 is never
    # modified in place (in-place edits on a slice can raise
    # SettingWithCopyWarning).
    df_ = df1[df1['first'] == disease].set_index(['patient'])

    # Number of annotations per full path, most frequent first.
    paths_mult_ = (
        df_.groupby('diagnosisstring', dropna=True)['diagnosisstring']
        .size()
        .to_frame('noAnnotations')
        .reset_index()
        .sort_values('noAnnotations', ascending=False)
    )

    # NOTE(review): `paths_` and `pillow_image_` are not read again in the
    # cells visible here; kept in case other cells rely on them.
    paths_ = list(pd.unique(df_['diagnosisstring']))
    root_ = dataframe_to_tree(paths_mult_, sep='/')
    root_.show(attr_list=["noAnnotations"])
    pillow_image_ = tree_to_pillow(root_)

 Cardiovascular (R)
├── Hypertension Requiring Treatment
│   └── hypertension requiring treatment [noAnnotations=4771]
├── Congestive Heart Failure
│   ├── CHF [noAnnotations=962]
│   ├── CHF - severity unknown [noAnnotations=151]
│   ├── CHF - class II [noAnnotations=104]
│   ├── CHF - class III [noAnnotations=86]
│   ├── CHF - class IV [noAnnotations=29]
│   └── CHF - class I [noAnnotations=9]
├── Arrhythmias
│   ├── atrial fibrillation - chronic [noAnnotations=857]
│   ├── atrial fibrillation - intermittent [noAnnotations=221]
│   ├── ventricular tachycardia [noAnnotations=85]
│   ├── SVT- other [noAnnotations=27]
│   ├── sick sinus syndrome [noAnnotations=23]
│   ├── ventricular ectopy [noAnnotations=22]
│   ├── ventricular fibrillation [noAnnotations=5]
│   └── MAT [noAnnotations=1]
├── Peripheral Vascular Disease
│   └── peripheral vascular disease [noAnnotations=540]
├── Myocardial Infarction
│   ├── MI - date unknown [noAnnotations=420]
│   ├── MI - remote [noAnnotations=196]
│

In [15]:
# Normalise the top-level labels (removes surrounding whitespace) so they can
# be matched against the literal category names below.
df1['first'] = df1['first'].str.strip()
from tabulate import tabulate

# Top-level categories to report, in display order.
diseases = [
    'Cardiovascular (R)',
    'Endocrine (R)',
    'Pulmonary',
    'Neurologic',
    'Hematology',
    'Renal  (R)',
    'Gastrointestinal (R)',
    'Rheumatic',
    'Infectious Disease (R)',
]

# For every category print a table of its second-level entries with the
# absolute count and the percentage within that category (two decimals).
for disease in diseases:
    df_disease = df1.loc[df1['first'] == disease].copy()

    freq = df_disease['second'].value_counts().to_frame('noAnnotations')
    freq['%'] = (freq['noAnnotations'] / len(df_disease.index) * 100).round(2)

    table = tabulate(freq, headers="keys", tablefmt="psql", showindex=True)
    print(f"The frequency of {disease}:")
    print(table)
    print("------------------------------")

The frequency of Cardiovascular (R):
+----------------------------------+-----------------+-------+
| second                           |   noAnnotations |     % |
|----------------------------------+-----------------+-------|
| Hypertension Requiring Treatment |            4771 | 38.66 |
| Congestive Heart Failure         |            1341 | 10.87 |
| Arrhythmias                      |            1241 | 10.06 |
| Valve disease                    |             908 |  7.36 |
| Myocardial Infarction            |             894 |  7.24 |
| Procedural Coronary Intervention |             739 |  5.99 |
| Coronary Artery Bypass           |             580 |  4.7  |
| Peripheral Vascular Disease      |             540 |  4.38 |
| Pacemaker                        |             336 |  2.72 |
| Venous Thrombosis                |             278 |  2.25 |
| Angina                           |             215 |  1.74 |
| AICD                             |             203 |  1.64 |
| Pulmonary Emboli

In [16]:
# Truncate every path to its first two levels ("first/second").
df1['diagnosisstring'] = df1['diagnosisstring'].str.split('/').str[:2].str.join('/')
# The deeper levels are no longer needed. errors='ignore' keeps the cell
# re-runnable: without it a second execution raises KeyError because the
# columns are already gone.
df1 = df1.drop(columns=['third','fourth'], errors='ignore')
df1

Unnamed: 0,patient,diagnosisstring,first,second
23819,260132,Endocrine (R)/Hypothyroidism,Endocrine (R),Hypothyroidism
24064,261021,Cardiovascular (R)/Pacemaker,Cardiovascular (R),Pacemaker
24065,261021,Pulmonary/Asthma,Pulmonary,Asthma
24066,261021,Pulmonary/COPD,Pulmonary,COPD
24067,261021,Pulmonary/Asthma,Pulmonary,Asthma
...,...,...,...,...
471271,3247116,Pulmonary/Respiratory Failure,Pulmonary,Respiratory Failure
471272,3247116,Cardiovascular (R)/Hypertension Requiring Tre...,Cardiovascular (R),Hypertension Requiring Treatment
471273,3247116,Pulmonary/Home Oxygen,Pulmonary,Home Oxygen
471274,3247116,Pulmonary/COPD,Pulmonary,COPD


In [17]:
# Persist the trimmed diagnoses table as CSV and as HDF5 (stored under key 'df').
# NOTE(review): both target directories are absolute, machine-specific paths.
csv = '/home/mei/nas/docker/thesis/data/csv/'
df1.to_csv(f'{csv}diagnoses_trim.csv')

hdf = '/home/mei/nas/docker/thesis/data/hdf/'
df1.to_hdf(f'{hdf}final_diagnoses_trim.h5', key='df')

In [21]:
# Display the final trimmed table (patient, two-level path, first, second).
df1

Unnamed: 0,patient,diagnosisstring,first,second
23819,260132,Endocrine (R)/Hypothyroidism,Endocrine (R),Hypothyroidism
24064,261021,Cardiovascular (R)/Pacemaker,Cardiovascular (R),Pacemaker
24065,261021,Pulmonary/Asthma,Pulmonary,Asthma
24066,261021,Pulmonary/COPD,Pulmonary,COPD
24067,261021,Pulmonary/Asthma,Pulmonary,Asthma
...,...,...,...,...
471271,3247116,Pulmonary/Respiratory Failure,Pulmonary,Respiratory Failure
471272,3247116,Cardiovascular (R)/Hypertension Requiring Tre...,Cardiovascular (R),Hypertension Requiring Treatment
471273,3247116,Pulmonary/Home Oxygen,Pulmonary,Home Oxygen
471274,3247116,Pulmonary/COPD,Pulmonary,COPD
