In [1]:
import numpy as np
import pandas as pd
import psycopg2
import getpass
# for configuring connection 
from configobj import ConfigObj
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *

In [2]:
# Create a database connection using settings from config file.
# When the config file is missing, built-in defaults are used instead.
config='../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    # ConfigObj returns every value as a string (including the port).
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        # NOTE(review): the port is compared against the *string* '6432'. With
        # the built-in defaults above the port is the int 6432, so this test is
        # False and the host/port branch below is taken. Left as-is on purpose:
        # changing it would alter which connection path runs.
        if conn_info["sqlhost"] == '192.168.60.144' and conn_info["sqlport"] == '6432':
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   user=conn_info["sqluser"])
        else:
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                   host=conn_info["sqlhost"],
                                   port=conn_info["sqlport"],
                                   user=conn_info["sqluser"])
    except psycopg2.OperationalError:
        # Only a failed connection attempt triggers the password prompt;
        # unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        conn_info["sqlpass"] = getpass.getpass('Password: ')

        con = psycopg2.connect(dbname=conn_info["dbname"],
                               host=conn_info["sqlhost"],
                               port=conn_info["sqlport"],
                               user=conn_info["sqluser"],
                               password=conn_info["sqlpass"])
else:
    # A password came from the config file: connect with it directly so that
    # `con` is defined on every path through this cell.
    con = psycopg2.connect(dbname=conn_info["dbname"],
                           host=conn_info["sqlhost"],
                           port=conn_info["sqlport"],
                           user=conn_info["sqluser"],
                           password=conn_info["sqlpass"])

# Statement prefix that points subsequent queries at the configured schema(s).
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [3]:
from sqlalchemy import create_engine,text
from sqlalchemy.engine import URL

# Assemble the connection URL from its components instead of interpolating
# them into a string: URL.create escapes the credentials, so a password
# containing characters such as '@', '/', ':' or '%' no longer yields a
# malformed URL.
engine_url = URL.create(
    drivername='postgresql',
    username=conn_info["sqluser"],
    password=conn_info["sqlpass"],
    host=conn_info["sqlhost"],
    port=int(conn_info["sqlport"]),  # may be a str (config file) or an int (defaults)
    database=conn_info["dbname"],
)

# `con` is (re)bound to a SQLAlchemy engine; every connection it opens starts
# with search_path set to eicu_crd.
con = create_engine(
    engine_url,
    connect_args={'options': '-c search_path=eicu_crd'}
)

In [18]:
# (Re)build the `diagnoses` materialized view: past-medical-history paths from
# `pasthistory` joined to `labels` on the unit-stay id, keeping only rows with
# a positive pasthistoryoffset whose path contains 'Organ Systems'.
# The leading DROP ... CASCADE removes any previous version of the view first.
create_table_query = query_schema + """
drop materialized view if exists diagnoses cascade;
create materialized view diagnoses as
  -- for past medical history:
select l.patient,l.apacheadmissiondx, ph.pasthistorypath as diagnosisstring
from pasthistory as ph
inner join labels as l on l.patient = ph.patientunitstayid
where ph.pasthistoryoffset > 0
      and ph.pasthistorypath LIKE '%Organ Systems%'      
"""
# con.begin() opens a transaction that is committed when the block exits.
with con.begin() as connection:
    view_statement = text(create_table_query)
    connection.execute(view_statement)

# Load the full view into memory and report its size.
select_query = "SELECT * FROM diagnoses;"
df_dx = pd.read_sql_query(select_query, con)
patient_column = df_dx["patient"]
print("there are {} patiets and {} records".format(patient_column.nunique(), patient_column.count()))

there are 93272 patiets and 482385 records


## Process the diagnoses

In [19]:
# `flat`: one row per distinct (patient, apacheadmissiondx) pair.
admission_columns = ['patient', 'apacheadmissiondx']
flat = df_dx.loc[:, admission_columns].drop_duplicates()

# `diagnoses`: the full result re-indexed by patient id. set_index already
# returns a new frame, so `df_dx` itself is left untouched.
diagnoses = df_dx.set_index('patient')

In [20]:

# Build the code mappings from the distinct diagnosis strings.
# build_mapping_dict comes from `utils`; as used below, mapping_dict maps a
# diagnosis string to a list of integer codes and `count` is the number of
# code columns.
unique_diagnoses = diagnoses['diagnosisstring'].unique()
codes_dict, mapping_dict, count, words_dict = build_mapping_dict(unique_diagnoses)

# One matrix row per distinct patient id, one column per code.
patients = diagnoses.index.unique()
patient_to_index = dict(zip(patients, range(len(patients))))
sparse_diagnoses = np.zeros((len(patients), count))

for patient, diag_list in diagnoses.groupby('patient')['diagnosisstring']:
    diag_list = diag_list.tolist()  # convert to a plain Python list
    # Collect the codes of all of this patient's diagnosis strings (strings
    # missing from mapping_dict contribute nothing) and keep only codes that
    # are valid column positions.
    codes = [
        code
        for diag in diag_list
        for code in mapping_dict.get(diag, [])
        if 0 <= code < count
    ]
    if patient not in patient_to_index:  # only fill rows for known patients
        continue
    sparse_diagnoses[patient_to_index[patient], codes] = 1

# Create Pandas DataFrame
sparse_df = pd.DataFrame(sparse_diagnoses, index=patients, columns=range(count))
print(f'Sparse DataFrame shape: {sparse_df.shape}') 

cutoff_prevalence= 0.01 # 1%
# The same threshold, expressed as a patient count, is passed to both
# find_rare_codes and add_admission_diagnoses.
min_patient_count = round(cutoff_prevalence * len(patients))
print('==> Filtering codes...')
# Both helpers come from `utils`; presumably they return the column labels to
# discard (verify against utils).
unnecessary_codes = find_unnecessary_codes(codes_dict)
rare_codes = find_rare_codes(min_patient_count, sparse_df)
sparse_df = sparse_df.drop(columns=unnecessary_codes + rare_codes)
# Relabel the remaining columns through words_dict.
sparse_df = sparse_df.rename(columns=words_dict)
print('==> Adding admission diagnoses from flat...')
sparse_df = add_admission_diagnoses(sparse_df, flat, min_patient_count)
print(f'Sparse DataFrame shape: {sparse_df.shape}')
print(f'==> Keeping {sparse_df.shape[1]} diagnoses with prevalence > {cutoff_prevalence * 100:.2f}%...')

Sparse DataFrame shape: (93272, 344)
==> Filtering codes...
==> Adding admission diagnoses from flat...
Sparse DataFrame shape: (93272, 119)
==> Keeping 119 diagnoses with prevalence > 1.00%...


In [21]:
# Output directory for CSV artefacts (absolute, machine-specific path).
csv= '/home/mei/nas/docker/thesis/data/csv/'
# Write the encoded diagnoses table, index included, to diagnoses.csv.
sparse_df.to_csv(f'{csv}diagnoses.csv')

In [None]:
# Reload the saved table and use the patient id column as the index.
print('==> Loading data diagnoses.csv...')
diagnoses = pd.read_csv(f'{csv}diagnoses.csv').set_index('patient')
diagnoses

==> Loading data diagnoses.csv...


Unnamed: 0_level_0,Cardiovascular (R),Cardiovascular (R)|AICD,Cardiovascular (R)|Angina,Cardiovascular (R)|Arrhythmias,Cardiovascular (R)|Arrhythmias|atrial fibrillation - chronic,Cardiovascular (R)|Arrhythmias|atrial fibrillation - intermittent,Cardiovascular (R)|Congestive Heart Failure,Cardiovascular (R)|Congestive Heart Failure|CHF,Cardiovascular (R)|Congestive Heart Failure|CHF - severity unknown,Cardiovascular (R)|Coronary Artery Bypass,...,apacheadmissiondx_Rhythm disturbance (conduction defect),apacheadmissiondx_Seizures (primary-no structural brain disease),"apacheadmissiondx_Sepsis, GI","apacheadmissiondx_Sepsis, cutaneous/soft tissue","apacheadmissiondx_Sepsis, pulmonary","apacheadmissiondx_Sepsis, renal/UTI (including bladder)","apacheadmissiondx_Sepsis, unknown",grouped_apacheadmissiondx_GI,"grouped_apacheadmissiondx_Overdose,","grouped_apacheadmissiondx_Pneumonia,"
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141168,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
141194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
141203,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
141260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
141265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3353216,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3353226,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3353237,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Fetch the distinct patient ids from the `id` relation, sorted ascending.
# query_schema is prepended so the search_path is set before the SELECT runs.
query = query_schema + """
select distinct(patient)
from id
order by patient;
"""
# NOTE(review): the result is bound to the name `id`, which shadows Python's
# builtin id(); a later cell reads id['patient'], so the name is kept here.
id = pd.read_sql_query(query, con)


In [6]:
# Display the `id` DataFrame (here `id` is the query result, not the builtin).
id

Unnamed: 0,patient
0,260132
1,261021
2,270853
3,282833
4,306940
...,...
3067,3246409
3068,3246443
3069,3246620
3070,3246731


In [11]:
# Directory holding the CSV artefacts (absolute, machine-specific path).
csv= '/home/mei/nas/docker/thesis/data/csv/'
print('==> Loading data diagnoses.csv...')
diagnoses = pd.read_csv(f'{csv}diagnoses.csv')

# Patient ids that occur both in the diagnoses table and in the `id` table.
common_patients = set(diagnoses['patient']).intersection(id['patient'])

# Keep only the rows belonging to those patients.
in_cohort = diagnoses['patient'].isin(common_patients)
diagnoses = diagnoses.loc[in_cohort]
print(f'==> Keeping {diagnoses.shape[0]} patients with admission diagnoses...')

==> Loading data diagnoses.csv...
==> Keeping 3072 patients with admission diagnoses...


In [13]:
# Output directory for HDF5 artefacts (absolute, machine-specific path).
hdf= '/home/mei/nas/docker/thesis/data/hdf/'
# Store the filtered diagnoses table under the key 'df'.
diagnoses.to_hdf(f'{hdf}final_diagnoses.h5', key='df')

In [15]:
# Read the stored table back and use the patient id column as the index.
diagnoses = pd.read_hdf(f'{hdf}final_diagnoses.h5', key='df').set_index('patient')
diagnoses

Unnamed: 0_level_0,Cardiovascular (R),Cardiovascular (R)|AICD,Cardiovascular (R)|Angina,Cardiovascular (R)|Arrhythmias,Cardiovascular (R)|Arrhythmias|atrial fibrillation - chronic,Cardiovascular (R)|Arrhythmias|atrial fibrillation - intermittent,Cardiovascular (R)|Congestive Heart Failure,Cardiovascular (R)|Congestive Heart Failure|CHF,Cardiovascular (R)|Congestive Heart Failure|CHF - severity unknown,Cardiovascular (R)|Coronary Artery Bypass,...,apacheadmissiondx_Rhythm disturbance (conduction defect),apacheadmissiondx_Seizures (primary-no structural brain disease),"apacheadmissiondx_Sepsis, GI","apacheadmissiondx_Sepsis, cutaneous/soft tissue","apacheadmissiondx_Sepsis, pulmonary","apacheadmissiondx_Sepsis, renal/UTI (including bladder)","apacheadmissiondx_Sepsis, unknown",grouped_apacheadmissiondx_GI,"grouped_apacheadmissiondx_Overdose,","grouped_apacheadmissiondx_Pneumonia,"
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
261021,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
270853,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
282833,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
306940,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3246409,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3246443,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3246620,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3246731,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
