In [2]:
import numpy as np
import pandas as pd
import itertools
import gc
import psycopg2
import getpass
# for configuring connection 
from configobj import ConfigObj
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *

In [3]:
# Create a database connection using settings from the config file, falling
# back to the built-in defaults below when the file does not exist.
config='../db/config.ini'

# connection info
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = ''
    conn_info["sqlhost"] = '192.168.60.144'
    conn_info["sqlport"] = 6432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu_crd'

# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))

# Always pass host and port explicitly. The previous special case compared the
# port against the string '6432' (never equal to the int default above) and,
# when it did match, dropped host/port altogether, which makes libpq connect to
# its local default socket instead of the configured server.
connect_kwargs = dict(dbname=conn_info["dbname"],
                      host=conn_info["sqlhost"],
                      port=conn_info["sqlport"],
                      user=conn_info["sqluser"])

if conn_info["sqlpass"] == '':
    # try connecting without password, i.e. peer or OS authentication
    try:
        con = psycopg2.connect(**connect_kwargs)
    except psycopg2.OperationalError:
        # Only connection/authentication failures trigger the password prompt;
        # any other error (typo, interrupt, ...) now surfaces unchanged.
        conn_info["sqlpass"] = getpass.getpass('Password: ')
        con = psycopg2.connect(password=conn_info["sqlpass"], **connect_kwargs)
else:
    # A password was supplied by the config file: use it, so that `con` is
    # defined on every path through this cell.
    con = psycopg2.connect(password=conn_info["sqlpass"], **connect_kwargs)

# Statement prefix that points subsequent queries at the configured schema(s).
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres


In [4]:
from urllib.parse import quote_plus

from sqlalchemy import create_engine,text

# Build the SQLAlchemy engine used by the rest of the notebook. From here on
# `con` refers to this engine, not to a raw psycopg2 connection.
# User name and password are URL-encoded first: characters such as '@', ':' or
# '/' in a password would otherwise be parsed as URL delimiters.
db_user = quote_plus(str(conn_info["sqluser"]))
db_pass = quote_plus(str(conn_info["sqlpass"]))

con = create_engine(
    f'postgresql://{db_user}:{db_pass}@{conn_info["sqlhost"]}:{conn_info["sqlport"]}/{conn_info["dbname"]}',
    # Every connection opened by the engine starts with this search_path.
    connect_args={'options': '-c search_path=eicu_crd'}
)

In [4]:
# (Re)build the `timeserieslab` materialized view: lab results with a positive
# offset, restricted to patients present in the `id` relation.
lab_view_sql = """
drop materialized view if exists timeserieslab cascade;
create materialized view timeserieslab as
  select l.patientunitstayid as patient, l.labresultoffset, l.labname,l.labresult
    from lab as l
    inner join id 
      on id.patient = l.patientunitstayid -- only extract data for the cohort
    where l.labresultoffset > 0;

"""
create_table_query = query_schema + lab_view_sql

# Run the DDL inside a single transaction that commits on success.
with con.begin() as ddl_conn:
    ddl_conn.execute(text(create_table_query))

# Pull the whole view into memory.
select_query = "SELECT * FROM timeserieslab;"
df_lab = pd.read_sql_query(select_query, con)

lab_patient_col = df_lab["patient"]
n_lab_patients = lab_patient_col.nunique()
n_lab_records = lab_patient_col.count()
print("there are {} patiets and {} records in lab test table".format(n_lab_patients, n_lab_records))

there are 3072 patiets and 702430 records in lab test table


In [5]:
# Keep rows whose offset is at most 11*24*60 (presumably minutes, i.e. the
# first 11 days -- TODO confirm the unit), index them by (patient, offset),
# map each offset through `round_up`, and sort the index.
# `df_lab` itself is left untouched: every step returns a new frame.
within_window = df_lab['labresultoffset'] <= 11*24*60
timeseries_lab = (
    df_lab[within_window]
    .set_index(['patient','labresultoffset'])
    .rename(round_up, level = 'labresultoffset')
    .sort_index()
)

In [6]:
# Run the project helper `process_vital_signs` on the indexed lab frame with
# lab=True, telling it that 'labresultoffset' is the time level. Its log output
# reports reconfiguring the lab timeseries, filtering invalid values/outliers
# and normalizing; the helper is not defined in this notebook (presumably it
# comes from `utils` -- verify there for the exact rules).
lab = process_vital_signs(timeseries_lab,'labresultoffset',lab=True)

==> Reconfiguring lab test timeseries...
==> Filtering out invalid values with the possible value ranges...
There are 3072 patients and 112493 records in the vital periodic table.
==> Filtering out outliers...
select valid vlaue of vital signs
There are 3072 patients and 112493 records in the vital periodic table.
==> Normalizing data...


In [35]:
# Directory holding the HDF5 artefacts of this pipeline.
hdf = '/home/mei/nas/docker/thesis/data/hdf/'

# Previously stored vital-sign frame.
vital = pd.read_hdf(hdf + 'vital_6_8.h5', key='vital_6_8')

# Work on a copy of the lab frame with the outermost column level removed.
l = lab.copy()
l.columns = l.columns.droplevel(0)

## get common patients
# Level 0 of both indexes is treated as the patient id.
lab_ids = l.index.get_level_values(0)
vital_ids = vital.index.get_level_values(0)
common_patients = list(set(lab_ids) & set(vital_ids))
print("there are {} common patients".format(len(common_patients)))

# Restrict both frames to the shared patients.
l = l.loc[lab_ids.isin(common_patients)]
vital = vital.loc[vital_ids.isin(common_patients)]

n_vital_patients = vital.index.get_level_values(0).nunique()
n_lab_patients = l.index.get_level_values(0).nunique()
print("there are {} patients in vital and {} patients in lab".format(n_vital_patients, n_lab_patients))
print("there are {} records in vital and {} records in lab".format(len(vital), len(l)))

print('==> Combining data together...')
# Stack the rows of both frames; columns are not re-sorted.
merged = pd.concat([l, vital], axis=0, sort=False)
n_merged_patients = len(merged.index.get_level_values(0).unique())
print("there are {} patients in the merged table and {} records.".format(n_merged_patients, len(merged)))

there are 3072 common patients
there are 3072 patients in vital and 3072 patients in lab
there are 2101463 records in vital and 112493 records in lab
==> Combining data together...
there are 3072 patients in the merged table and 2213956 records.


In [38]:
# Resample the merged frame chunk by chunk and append each result to one HDF5
# table, so that only a single chunk has to be held in memory at a time.
patients = merged.index.unique(level=0)
gen_chunks = gen_patient_chunk(patients, merged)
total_patients = len(patients)
final_path = hdf + "final_timeseries.h5"

# Every chunk is *appended* to the store, so a file left over from an earlier
# run would end up with duplicated rows. Start from a clean file to keep the
# cell safe to re-run.
if os.path.exists(final_path):
    os.remove(final_path)

print('==> Initiating main processing loop...')

for i, patient_chunk in enumerate(gen_chunks, start=1):
    final = resample(patient_chunk)
    final.to_hdf(final_path, key="df", mode="a", complevel=5, complib="zlib", format="table", append=True)

    # The message assumes chunks of 500 patients (as the original did -- verify
    # against gen_patient_chunk); cap it so the last, shorter chunk does not
    # report more patients than exist.
    print(f'==> Processed {min(i * 500, total_patients)} patients...')

    del patient_chunk, final # free up memory
    gc.collect()

==> Initiating main processing loop...
==> Processed 500 patients...
==> Processed 1000 patients...
==> Processed 1500 patients...
==> Processed 2000 patients...
==> Processed 2500 patients...
==> Processed 3000 patients...
==> Processed 3500 patients...


In [3]:
# Reload the stored timeseries table from the HDF5 directory.
hdf = '/home/mei/nas/docker/thesis/data/hdf/'
final_ts_file = hdf + "final_timeseries.h5"
# `key` selects the dataset inside the HDF file.
final_ts = pd.read_hdf(final_ts_file, key="df")
final_ts

Unnamed: 0_level_0,Unnamed: 1_level_0,-bands,-basos,-eos,-lymphs,-monos,-polys,24 h urine protein,24 h urine urea nitrogen,ALT (SGPT),ANF/ANA,...,sao2,heartrate,respiration,cvp,systemicsystolic,systemicdiastolic,systemicmean,st1,st2,st3
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
260132,1,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,0.5,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,2,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,0.5,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,3,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,0.5,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,4,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,0.5,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,5,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,0.5,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3247116,2328,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,0.5,...,0.890286,0.602807,0.443774,0.854000,0.581317,0.396000,0.463358,0.5,0.500000,0.5
3247116,2329,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,0.5,...,0.890476,0.601974,0.444182,0.854167,0.581587,0.397348,0.464416,0.5,0.500000,0.5
3247116,2330,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,0.5,...,0.889441,0.601449,0.432322,0.854348,0.576412,0.394466,0.459854,0.5,0.500000,0.5
3247116,2331,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,0.5,...,0.888312,0.600080,0.433962,0.852273,0.570495,0.391322,0.455209,0.5,0.500000,0.5


In [4]:
# Largest value found in the second index level (the time step).
max_time_step = final_ts.index.get_level_values(1).max()
print("the max time step is {}".format(max_time_step))

the max time step is 3169


In [5]:
# save the feature names
features = final_ts.columns

# Write one name per line as plain text. The file is read back line by line
# (not with a CSV parser), so going through the CSV writer would add quoting
# around any name containing a comma or a quote character and the names would
# no longer round-trip.
with open('/home/mei/nas/docker/thesis/data/hdf/features.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{name}\n" for name in features)

In [None]:
# Load the saved feature names: one per line, surrounding whitespace removed.
with open('/home/mei/nas/docker/thesis/data/hdf/features.txt', 'r') as f:
    features = list(map(str.strip, f))
features

In [None]:
# Only the (patient, time) index pairs are needed, so build the two-column
# frame straight from the index instead of deep-copying the whole wide table
# and materialising every feature column first.
ts = final_ts.index.to_frame(index=False)
ts = ts[['patient','time']]
ts

In [10]:
# Rebuild the `id` materialized view from the in-memory (patient, time) frame.

# 1) Drop the old view and its staging table. CASCADE also removes anything
#    that was built on top of them.
with con.begin() as conn:
    conn.execute(text("DROP MATERIALIZED VIEW IF EXISTS id CASCADE;"))
    conn.execute(text("DROP TABLE IF EXISTS temp_id CASCADE;"))

# 2) Upload the frame into the staging table. With method="multi" and no
#    chunksize pandas sends *all* rows in one INSERT statement; batching keeps
#    each statement (and the memory needed to build it) bounded.
ts.to_sql("temp_id", con, if_exists="replace", index=True, method="multi", chunksize=10_000)

# 3) Recreate the view on top of the staging table.
with con.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS id CASCADE;"))
    connection.execute(text("""
                            
        CREATE MATERIALIZED VIEW id AS
        SELECT patient, time
        FROM temp_id;
        
    """))
    

In [12]:
# Sanity check: read the patient column back from the `id` view and report the
# number of distinct patients and of rows.
select_query = "SELECT patient FROM id;"
df_id = pd.read_sql_query(select_query, con)

id_patient_col = df_id["patient"]
n_id_patients = id_patient_col.nunique()
n_id_records = id_patient_col.count()
print("there are {} patiets and {} records".format(n_id_patients, n_id_records))

there are 3072 patiets and 4486431 records
