In [34]:
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import numpy as np
# get data path from project directory
from pathlib import Path
import os
from os.path import join, split
import sys
from tqdm import tqdm
data_path = join(Path(os.getcwd()).parent.parent, 'data')
mimic_path = join(data_path, 'raw', 'mimic-iii-clinical-database-1.4')

In [2]:
files = [f for f in os.listdir(mimic_path) if f.endswith('.csv.gz')]
print(files)

['ADMISSIONS.csv.gz', 'CALLOUT.csv.gz', 'CAREGIVERS.csv.gz', 'CHARTEVENTS.csv.gz', 'CPTEVENTS.csv.gz', 'DATETIMEEVENTS.csv.gz', 'DIAGNOSES_ICD.csv.gz', 'DRGCODES.csv.gz', 'D_CPT.csv.gz', 'D_ICD_DIAGNOSES.csv.gz', 'D_ICD_PROCEDURES.csv.gz', 'D_ITEMS.csv.gz', 'D_LABITEMS.csv.gz', 'ICUSTAYS.csv.gz', 'INPUTEVENTS_CV.csv.gz', 'INPUTEVENTS_MV.csv.gz', 'LABEVENTS.csv.gz', 'MICROBIOLOGYEVENTS.csv.gz', 'NOTEEVENTS.csv.gz', 'OUTPUTEVENTS.csv.gz', 'PATIENTS.csv.gz', 'PRESCRIPTIONS.csv.gz', 'PROCEDUREEVENTS_MV.csv.gz', 'PROCEDURES_ICD.csv.gz', 'SERVICES.csv.gz', 'TRANSFERS.csv.gz']


In [42]:
def load_csv(path, columns_str=None, columns=None):
    if path.endswith('.gz'):
        compression = 'gzip'
    else:
        compression = None
    if not columns_str is None:
        all_columns = pl.read_csv(path, n_rows=1, low_memory=True).columns
        selected_columns = [c for c in all_columns for s in columns_str if columns_str in c]
        if not columns is None:
            columns = list(set(columns + selected_columns))
        else:
            columns = selected_columns
    df = pl.read_csv(path, columns=columns, low_memory=True, parse_dates=True)
    return df

In [43]:
def pandas_get_columns(path):
    if path.endswith('.gz'):
        compression = 'gzip'
    return pd.read_csv(path, nrows=1, compression=compression).columns

In [57]:
def get_dest_path_for_parquet(dest_dir, file_path):
    file_name = split(file_path)[1]
    if not dest_dir is None:
        dest_path = join(dest_dir, file_name.replace('.csv.gz', '.parquet.gz'))
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
    else:
        file_name = file_name.replace('.csv.gz', '.parquet.gz')
        mimic_dir = split(file_path)[0]
        mimic_dir_name = split(mimic_dir)[1]
        data_dir = split(split(mimic_dir)[0])[0]
        interim_mimic_dir = join(data_dir, 'interim', mimic_dir_name)
        if not os.path.exists(interim_mimic_dir):
            os.makedirs(interim_mimic_dir)
        dest_path = join(interim_mimic_dir, file_name)
    return dest_path

def convert_csv_to_parquet(mimic_path, file_name, dest_dir=None, test=False, nrows=None):
    """
        test: load only 1000 rows
        file_name: path of the csv file, with ending
        dest_dir: directory to store the parquet files, without ending
    """
    mimic_path = mimic_path# cfg.raw_data_path
    dest_dir = dest_dir # cfg.data_working_path
    file_path = join(mimic_path, file_name)
    columns = pandas_get_columns(file_path)
    dtype_dic = {column:"Int64" for column in columns if column.endswith('ID') and column!='FLUID'}
    if test:
        nrows = 300
    dest_path = get_dest_path_for_parquet(dest_dir, file_path)
    pd.read_csv(file_path, nrows=nrows, dtype=dtype_dic, parse_dates=True, compression='gzip').to_parquet(dest_path, compression='gzip', index=False)


_CV and _MV endings indicate the system used to record the data.\
D prefix is a dictionary table and provides definitions for clinical identifiers.
5 tables to track patients: Admissions, Patients, ICUstays, Services, Transfers

In [58]:
# group dfs
tracking_table_names = ['ADMISSIONS', 'PATIENTS', 'ICDUSTAYS', 'SERVICES', 'TRANSFERS']
tracking_sheet = [file for file in files for table in tracking_table_names if file.startswith(table)]
events_table_names = ['CALLOUT', 'CAREGIVERS', 'CHARTEVENTS', 'CPTEVENTS', 'DATETIMEEVENTS', 'DIAGNOSES_ICD', 'DRGCODES', 'INPUTEVENTS', 'LABEVENTS', 'NOTEEVENTS', 'OUTPUTEVENTS', 'PRESCRIPTIONS', 'PROCEDUREEVENTS', 'PROCEDURES_ICD']
events_sheet = [file for file in files for table in events_table_names if file.startswith(table)]
dictionary_sheet = [table for table in files if table.startswith('D_')]

In [59]:
tables = tracking_sheet + events_sheet + dictionary_sheet
i = 0
for table in tqdm(tables):
    convert_csv_to_parquet(mimic_path, table)

 25%|██▌       | 6/24 [00:18<00:47,  2.65s/it]

In [47]:
load_csv(join(mimic_path, 'D_LABITEMS.csv.gz'))

ROW_ID,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
i64,i64,str,str,str,str
546,51346,"""Blasts""","""Cerebrospinal ...","""Hematology""","""26447-3"""
547,51347,"""Eosinophils""","""Cerebrospinal ...","""Hematology""","""26451-5"""
548,51348,"""Hematocrit, CS...","""Cerebrospinal ...","""Hematology""","""30398-2"""
549,51349,"""Hypersegmented...","""Cerebrospinal ...","""Hematology""","""26506-6"""
550,51350,"""Immunophenotyp...","""Cerebrospinal ...","""Hematology""",
551,51351,"""Lymphs""","""Cerebrospinal ...","""Hematology""","""26479-6"""
552,51352,"""Macrophage""","""Cerebrospinal ...","""Hematology""","""30426-1"""
553,51353,"""Mesothelial ce...","""Cerebrospinal ...","""Hematology""","""30429-5"""
554,51354,"""Metamyelocytes...","""Cerebrospinal ...","""Hematology""","""30366-9"""
555,51355,"""Monocytes""","""Cerebrospinal ...","""Hematology""","""26486-1"""


In [36]:
df_adm = pd.read_csv(join(mimic_path, tracking_sheet[0]), compression='gzip')
df_adm

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,58594,98800,191113,2131-03-30 21:13:00,2131-04-02 15:02:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,ENGL,NOT SPECIFIED,SINGLE,WHITE,2131-03-30 19:44:00,2131-03-30 22:41:00,TRAUMA,0,1
58972,58595,98802,101071,2151-03-05 20:00:00,2151-03-06 09:10:00,2151-03-06 09:10:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,2151-03-05 17:23:00,2151-03-05 21:06:00,SAH,1,1
58973,58596,98805,122631,2200-09-12 07:15:00,2200-09-20 12:08:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,,,RENAL CANCER/SDA,0,1
58974,58597,98813,170407,2128-11-11 02:29:00,2128-12-22 13:11:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,ENGL,CATHOLIC,MARRIED,WHITE,2128-11-10 23:48:00,2128-11-11 03:16:00,S/P FALL,0,0


In [24]:
df_adm = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip')
print('length', len(df_adm))
print('unique patients', len(df_adm['SUBJECT_ID'].unique()))
df_adm.head()

length 58976
unique patients 46520


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [4]:
df_co = pd.read_csv(join(mimic_path, 'CALLOUT.csv.gz'), compression='gzip')
df_co.head()
print('length', len(df_co))
print('unique patients', len(df_co['SUBJECT_ID'].unique()))
df_co.head()

length 34499
unique patients 22871


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SUBMIT_WARDID,SUBMIT_CAREUNIT,CURR_WARDID,CURR_CAREUNIT,CALLOUT_WARDID,CALLOUT_SERVICE,REQUEST_TELE,...,CALLOUT_STATUS,CALLOUT_OUTCOME,DISCHARGE_WARDID,ACKNOWLEDGE_STATUS,CREATETIME,UPDATETIME,ACKNOWLEDGETIME,OUTCOMETIME,FIRSTRESERVATIONTIME,CURRENTRESERVATIONTIME
0,402,854,175684,52.0,,29.0,MICU,1,MED,0,...,Inactive,Discharged,29.0,Acknowledged,2146-10-05 13:16:55,2146-10-05 13:16:55,2146-10-05 13:24:00,2146-10-05 18:55:22,2146-10-05 15:27:44,
1,403,864,138624,15.0,,55.0,CSRU,55,CSURG,0,...,Inactive,Discharged,55.0,Acknowledged,2114-11-28 08:31:39,2114-11-28 09:42:08,2114-11-28 09:43:08,2114-11-28 12:10:02,,
2,404,864,138624,12.0,,55.0,CSRU,55,CSURG,1,...,Inactive,Discharged,55.0,Acknowledged,2114-11-30 10:24:25,2114-12-01 09:06:18,2114-12-01 12:26:05,2114-12-01 21:55:05,,
3,405,867,184298,7.0,,17.0,CCU,17,CCU,1,...,Inactive,Discharged,17.0,Acknowledged,2136-12-29 08:45:42,2136-12-29 10:17:16,2136-12-29 10:33:51,2136-12-29 18:10:02,,
4,157,306,167129,57.0,,3.0,SICU,44,NSURG,1,...,Inactive,Discharged,3.0,Acknowledged,2199-09-18 11:47:47,2199-09-18 11:47:47,2199-09-18 11:58:33,2199-09-18 15:10:02,,


In [8]:
df_co = ps.read_csv(join(mimic_path, 'CALLOUT.csv.gz'))
sys.getsizeof(df_co)

48

In [22]:
# not relevant for our analysis
df_cg = pd.read_csv(join(mimic_path, 'CAREGIVERS.csv.gz'), compression='gzip')

In [10]:
pd_df = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=50000)
#pd_df.to_parquet(join(mimic_path, 'CHARTEVENTS_sample.parquet'), index=False)

In [18]:
df_ps = pl.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'),  n_rows=5000, parse_dates=True)

In [19]:
df_ps

ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
i64,i64,i64,i64,i64,datetime[μs],datetime[μs],i64,f64,f64,str,i64,i64,str,str
788,36,165660,241249,223834,2134-05-12 12:00:00,2134-05-12 13:56:00,17525,15.0,15.0,"""L/min""",0,0,,
789,36,165660,241249,223835,2134-05-12 12:00:00,2134-05-12 13:56:00,17525,100.0,100.0,,0,0,,
790,36,165660,241249,224328,2134-05-12 12:00:00,2134-05-12 12:18:00,20823,0.37,0.37,,0,0,,
791,36,165660,241249,224329,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,6.0,6.0,"""min""",0,0,,
792,36,165660,241249,224330,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,2.5,2.5,,0,0,,
793,36,165660,241249,224331,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,0.0,0.0,"""ml/hr""",0,0,,
794,36,165660,241249,224332,2134-05-12 12:00:00,2134-05-12 14:44:00,17525,3.0,3.0,,0,0,,
795,36,165660,241249,224663,2134-05-12 12:00:00,2134-05-12 14:44:00,17525,8.0,8.0,,0,0,,
796,36,165660,241249,224665,2134-05-12 12:00:00,2134-05-12 14:44:00,17525,1.11,1.11,,0,0,,
797,36,165660,241249,220224,2134-05-12 12:35:00,2134-05-12 12:38:00,20889,58.0,58.0,"""mmHg""",1,0,,


In [17]:
df_ps['STOPPED'].value_counts()

STOPPED,counts
str,u32
,5000


In [16]:
df_ps['RESULTSTATUS'].value_counts()

RESULTSTATUS,counts
str,u32
,5000


In [29]:
sys.getsizeof(pd_df)

15393687

In [28]:
sys.getsizeof(df_ps)

48

In [20]:
# no information on type of event, only value and ID
df_ce = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=10000)
df_ce.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
0,788,36,165660,241249,223834,2134-05-12 12:00:00,2134-05-12 13:56:00,17525,15.0,15.0,L/min,0,0,,
1,789,36,165660,241249,223835,2134-05-12 12:00:00,2134-05-12 13:56:00,17525,100.0,100.0,,0,0,,
2,790,36,165660,241249,224328,2134-05-12 12:00:00,2134-05-12 12:18:00,20823,0.37,0.37,,0,0,,
3,791,36,165660,241249,224329,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,6.0,6.0,min,0,0,,
4,792,36,165660,241249,224330,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,2.5,2.5,,0,0,,


General approach:
- createa dataframe which contains: event_name, timestamp, value, visit, age 
- separate dataframes into batches