In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import numpy as np
# get data path from project directory
from pathlib import Path
import os
from os.path import join, split
import sys
from tqdm import tqdm
data_path = join(Path(os.getcwd()).parent.parent, 'data')
mimic_path = join(data_path, 'raw', 'mimic-iii-clinical-database-1.4')
extr_path = join(data_path, 'interim', 'mimic_iii_ml_for_health')
processed_path = join(data_path, 'formatted', 'mimic-iii-clinical-database-1.4')
import pyarrow as pa
from ehr_preprocess.preprocessors.mimic import MIMIC3Preprocessor

In [3]:
# Open CHARTEVENTS as a chunked reader (chunksize=100000), not as a single DataFrame.
# NOTE(review): the variable name and the commented-out line below refer to
# PROCEDUREEVENTS_MV, but the file opened here is CHARTEVENTS — confirm which was intended.
proc_mv = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', chunksize=100000)
# proc_mv.LOCATIONCATEGORY.value_counts()


In [19]:
for i, d in enumerate(proc_mv):
    print(len(d))

1000
1000
300


In [7]:
# get size in gb
proc_mv.memory_usage(index=True).sum() / 1024**3

0.048068657517433167

In [None]:
chart = pd.read_csv(join(mimic_path, 'INPUTEVENTS_CV.csv.gz'), compression='gzip', nrows=1000)

In [None]:
chart.columns[7]

In [None]:
inmv = pd.read_csv(join(mimic_path, 'INPUTEVENTS_MV.csv.gz'),compression='gzip', nrows=1000,)
inmv.head()

In [None]:
df = pd.read_parquet(join(processed_path, 'concept.med.parquet'))


In [None]:
df

In [None]:
df[df.TIMESTAMP.isnull()]

In [None]:
df.TIMESTAMP.isnull().sum()

In [None]:
# Attach the human-readable D_ITEMS label to each microbiology specimen ITEMID.
items_dic = pd.read_csv(join(mimic_path, 'D_ITEMS.csv.gz'), usecols=['ITEMID', 'LABEL'])
# Named `micro_events` (not `input`) so the built-in input() is not shadowed for the rest of the session.
micro_events = pd.read_csv(join(mimic_path, 'MICROBIOLOGYEVENTS.csv.gz'), nrows=1000).rename(columns={'SPEC_ITEMID': 'ITEMID'})
# Left join keeps every event, including those whose ITEMID has no dictionary entry.
micro_events = pd.merge(micro_events, items_dic, on='ITEMID', how='left')
micro_events

In [None]:
items_dic = pd.read_csv(join(mimic_path, 'D_ITEMS.csv.gz'), nrows=10000)
# extract labels and itemids as dict
items_dic = items_dic[['LABEL', 'ITEMID']].set_index('ITEMID').to_dict()['LABEL']
def get_outputevents():
    events = pd.read_csv(join(mimic_path, 'OUTPUTEVENTS.csv.gz'), nrows=5000,
        usecols=['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEOM'])
    events['CONCEPT'] = events.ITEMID.map(items_dic)
    return events
events = get_outputevents()
events

In [None]:
items = pd.read_csv(join(mimic_path, 'D_ITEMS.csv.gz'), nrows=10000)
items[items.LINKSTO=='outputevents']

In [None]:
events = pd.read_csv(join(mimic_path, 'OUTPUTEVENTS.csv.gz'), nrows=5000)
events.head()


In [None]:
df_org = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'),  compression='gzip',
    parse_dates=['ADMITTIME', 'DISCHTIME', 'EDREGTIME', 'EDOUTTIME',],)
df_org

In [None]:
# Offsets (in whole days) between the hospital stay and the emergency-department stay.
# Each quantity gets its own column so neither overwrites the other, and both
# summaries are shown by the single describe() call that ends the cell.
df_org['adm_diff'] = (df_org['ADMITTIME'] - df_org['EDREGTIME']).dt.days
df_org['disch_diff'] = (df_org['DISCHTIME'] - df_org['EDOUTTIME']).dt.days
df_org[['adm_diff', 'disch_diff']].describe()

In [None]:
def convert_admission_discharge_to_events( df, start_col, end_col, concept_name):
    """Split each stay into one admission row and one discharge row.

    The admission rows carry `start_col` as TIMESTAMP, the discharge rows carry
    `end_col` as TIMESTAMP. Each half drops the other half's location column;
    for concept_name 'HOSPITAL' the remaining location column becomes VALUE_CAT.
    The input frame is not modified; the two halves are stacked row-wise.
    """
    # admission half: start time becomes the event time, end time is dropped
    admission_events = df.rename(columns={start_col: 'TIMESTAMP'}).drop(columns=[end_col])
    admission_events = admission_events.drop(columns=['DISCHARGE_LOCATION'])
    # discharge half: end time becomes the event time, start time is dropped
    discharge_events = df.drop(columns=[start_col]).drop(columns=['ADMISSION_LOCATION'])
    discharge_events = discharge_events.rename(columns={end_col: 'TIMESTAMP'})

    if concept_name == 'HOSPITAL':
        admission_events = admission_events.rename(columns={'ADMISSION_LOCATION': 'VALUE_CAT'})
        discharge_events = discharge_events.rename(columns={'DISCHARGE_LOCATION': 'VALUE_CAT'})

    admission_events['CONCEPT'] = f'T{concept_name}_ADMISSION'
    discharge_events['CONCEPT'] = f'T{concept_name}_DISCHARGE'
    return pd.concat([admission_events, discharge_events], axis=0)

In [None]:
df_icu = pd.read_csv(join(mimic_path, 'ICUSTAYS.csv.gz'), compression='gzip',nrows=10000, 
                parse_dates=['INTIME', 'OUTTIME'])
df_icu

In [None]:
# load OUTPUTEVENTS
df = pd.read_csv(join(mimic_path, 'OUTPUTEVENTS.csv.gz'), nrows=10000,)
df

In [None]:
df.columns

In [None]:
df = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=10000)
df

In [None]:
# build the increasing integers 1..9999 with every multiple of 10 left out,
# using range and a list comprehension
x = np.array([i for i in range(1, 10000) if i % 10 != 0])
print(x)


In [None]:
# load Items dic
items_dic = pd.read_csv(join(mimic_path, 'D_ITEMS.csv.gz'), nrows=10000)
items_dic

In [1]:
# load procedureevents_mv: MIMIC-III v1.4 ships this table as PROCEDUREEVENTS_MV.csv.gz,
# there is no PROCEDUREEVENTS.csv.gz
df = pd.read_csv(join(mimic_path, 'PROCEDUREEVENTS_MV.csv.gz'), nrows=1000)
df.columns

NameError: name 'pd' is not defined

In [None]:
# Label frequencies on a thinned sample of INPUTEVENTS_CV: every 100th data row
# of the first 1e6 rows.
sample_step = 100
scan_rows = int(1e6)
df = pd.read_csv(join(mimic_path, 'INPUTEVENTS_CV.csv.gz'), compression='gzip',
    usecols=['ITEMID'], dtype={'ITEMID': 'Int32'},
    # line 0 is the header and must be kept; every other line is kept only if it is a multiple of sample_step
    skiprows=lambda i: i % sample_step != 0,
    # nrows counts the rows that remain AFTER skipping, so it has to be the sample size,
    # not the number of lines scanned; otherwise the reader continues past the sampled region
    nrows=scan_rows // sample_step)
df_item_dic= pd.read_csv(join(mimic_path, 'D_ITEMS.csv.gz'), compression='gzip',)
df_m = pd.merge(df, df_item_dic[['ITEMID', 'LABEL']], on='ITEMID', how='left')
df_m.LABEL.value_counts()

# We will only process tables with data not contained in the mimic preprocessor

In [None]:
dfi = pd.read_hdf(join(extr_path, "all_hourly_data.h5"), 
    key='interventions')
dfi

In [None]:
dfvl = pd.read_hdf(join(extr_path, "all_hourly_data.h5"), 
    key='vitals_labs_mean')
dfvl

In [None]:
dfp = pd.read_hdf(join(extr_path, "all_hourly_data.h5"),
    key='patients')
dfp

## DRG codes

In [None]:
df_drg = pd.read_csv(join(mimic_path, 'DRGCODES.csv.gz'),  compression='gzip')
df_drg

Maybe relevant, check overlap with ICD codes

## Procedures

In [None]:
df_pro = pd.read_csv(join(mimic_path, 'PROCEDURES_ICD.csv.gz'),  compression='gzip')
df_pro

Time of the procedures not specified

In [None]:
df_ce = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=50000)
print('RESULTSTATUS',df_ce.RESULTSTATUS.unique(), 'STOPPED', df_ce.STOPPED.unique(), 'WARNING',df_ce.WARNING.unique())
print('STORETIME can be dropped, because its the time of entering the data into the database, CHARTTIME is the time of the measurement')
print('CGID can be dropped as we don\'t care about the care giver')
print('VALUENUM can be dropped, because we have the VALUE column which containes the full information')
print('We can mask out on the ERROR column and drop the ERROR column')
print('ROW_ID does not carry any information')

In [None]:
all_columns = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=1).columns
drop_columns = ['RESULTSTATUS', 'STOPPED','WARNING', 'STORETIME', 'CGID', 'VALUENUM', 'ROW_ID']
load_columns = [c for c in all_columns if not c in drop_columns]
df_ce = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=int(1e6), usecols=load_columns, parse_dates=['CHARTTIME'])
# df_ce = df_ce[df_ce['ERROR'] == 0]
# df_ce = df_ce.drop(columns=['ERROR'])
print(sys.getsizeof(df_ce)/1e6, 'MB')

In [None]:
def load_csv(path, columns_str=None, columns=None):
    """Read a CSV file into a polars DataFrame, optionally restricted to some columns.

    Args:
        path: path of the CSV file.
        columns_str: a substring, or an iterable of substrings. Every column whose
            name contains at least one of them is loaded.
        columns: explicit column names to load in addition to the `columns_str`
            matches. If both are None, all columns are loaded.

    Returns:
        The DataFrame returned by `pl.read_csv`.
    """
    if columns_str is not None:
        # A bare string is a single pattern, not an iterable of characters.
        patterns = [columns_str] if isinstance(columns_str, str) else list(columns_str)
        all_columns = pl.read_csv(path, n_rows=1, low_memory=True).columns
        # Test each pattern `s` against the column name; each column is listed once.
        selected_columns = [c for c in all_columns if any(s in c for s in patterns)]
        if columns is not None:
            # dict.fromkeys removes duplicates while keeping a stable order
            columns = list(dict.fromkeys(list(columns) + selected_columns))
        else:
            columns = selected_columns
    df = pl.read_csv(path, columns=columns, low_memory=True, parse_dates=True)
    return df

In [None]:
def pandas_get_columns(path):
    """Return the column names of a CSV file, reading only its first data row.

    Args:
        path: path of the CSV file; a name ending in '.gz' is read as gzip,
            anything else as an uncompressed file.

    Returns:
        The pandas Index of column names.
    """
    # Always bind `compression`: previously it was only set for '.gz' paths,
    # so any other path raised UnboundLocalError.
    compression = 'gzip' if str(path).endswith('.gz') else None
    return pd.read_csv(path, nrows=1, compression=compression).columns

In [None]:
def get_dest_path_for_parquet(dest_dir, file_path):
    """Return the parquet output path for a csv file, creating the target directory if needed.

    The output file name is the csv file name with '.csv.gz' replaced by
    '.parquet.gz'. With a `dest_dir`, the file goes there. With `dest_dir=None`,
    the target is derived from `file_path`: for <data>/<stage>/<dataset>/<file>
    it is <data>/interim/<dataset>.
    """
    source_dir, source_name = split(file_path)
    parquet_name = source_name.replace('.csv.gz', '.parquet.gz')

    if dest_dir is None:
        # go two levels up from the dataset directory, then down into 'interim'
        stage_dir, dataset_name = split(source_dir)
        target_dir = join(split(stage_dir)[0], 'interim', dataset_name)
    else:
        target_dir = dest_dir

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    return join(target_dir, parquet_name)

def convert_csv_to_parquet(mimic_path, file_name, dest_dir=None, test=False, nrows=None):
    """
        Read one gzip-compressed csv file and write it as a gzip-compressed parquet file.

        mimic_path: directory that contains the csv file
        file_name: name of the csv file, with ending
        dest_dir: directory to store the parquet file; handed to get_dest_path_for_parquet
        test: load only 300 rows (takes precedence over nrows)
        nrows: number of rows to read when test is False; None reads everything
    """
    source_path = join(mimic_path, file_name)
    # columns ending in 'ID' are read as nullable integers; FLUID ends in 'ID' but is excluded
    id_dtypes = {
        name: "Int64"
        for name in pandas_get_columns(source_path)
        if name.endswith('ID') and name != 'FLUID'
    }
    row_limit = 300 if test else nrows
    target_path = get_dest_path_for_parquet(dest_dir, source_path)
    frame = pd.read_csv(source_path, nrows=row_limit, dtype=id_dtypes, parse_dates=True, compression='gzip')
    frame.to_parquet(target_path, compression='gzip', index=False,)


The `_CV` and `_MV` suffixes indicate the system used to record the data (CareVue or MetaVision).\
The `D_` prefix marks a dictionary table, which provides definitions for clinical identifiers.\
Five tables track patients: ADMISSIONS, PATIENTS, ICUSTAYS, SERVICES, TRANSFERS.

In [None]:
# group dfs
# Group the raw files by role: patient-tracking tables, event tables, dictionaries.
# os.listdir returns entries in arbitrary order; sort so that positional access
# such as tracking_sheet[0] is reproducible.
files = sorted(os.listdir(mimic_path))
# 'ICUSTAYS' was previously misspelled 'ICDUSTAYS', so that table was never matched
tracking_table_names = ['ADMISSIONS', 'PATIENTS', 'ICUSTAYS', 'SERVICES', 'TRANSFERS']
# str.startswith accepts a tuple of prefixes, so each file is tested once
tracking_sheet = [file for file in files if file.startswith(tuple(tracking_table_names))]
events_table_names = ['CALLOUT', 'CAREGIVERS', 'CHARTEVENTS', 'CPTEVENTS', 'DATETIMEEVENTS', 'DIAGNOSES_ICD', 'DRGCODES', 'INPUTEVENTS', 'LABEVENTS', 'NOTEEVENTS', 'OUTPUTEVENTS', 'PRESCRIPTIONS', 'PROCEDUREEVENTS', 'PROCEDURES_ICD']
events_sheet = [file for file in files if file.startswith(tuple(events_table_names))]
dictionary_sheet = [table for table in files if table.startswith('D_')]

In [None]:
# str.strip treats its argument as a set of characters to trim from both ends,
# not as a suffix; remove the extension explicitly instead
events_sheet[0].replace('.csv.gz', '')

In [None]:
tables = tracking_sheet + events_sheet + dictionary_sheet

# Medication

In [None]:
dfp = pd.read_csv(join(mimic_path, 'PATIENTS.csv.gz'))
dfp[~dfp.DOD.isna()]

In [None]:
dfa[~dfa.DEATHTIME.isna()]

In [None]:
class MIMICPreprocessor_transfer(MIMIC3Preprocessor):
    """Turn hospital and emergency-department stays from ADMISSIONS into admission/discharge events."""

    def __init__(self, cfg, test=False):
        super(MIMICPreprocessor_transfer, self).__init__(cfg, test)
        self.concept_name = 'transfers'

    def __call__(self):
        """Load ADMISSIONS and print the hospital and emergency event frames."""
        df = self.load()
        df_hospital = self.get_concepts(df, 'ADMITTIME', 'DISCHTIME', 'HOSPITAL')
        df_emergency = self.get_concepts(df, 'EDREGTIME', 'EDOUTTIME', 'EMERGENCY')
        print(df_hospital)
        print(df_emergency)

    def load(self):
        """Read the ADMISSIONS columns used by this preprocessor, with all time columns parsed as dates."""
        # NOTE(review): self.raw_data_path is presumably set by MIMIC3Preprocessor.__init__ — confirm
        df = pd.read_csv(join(self.raw_data_path, 'ADMISSIONS.csv.gz'), compression='gzip',
            usecols=['SUBJECT_ID', 'HADM_ID','ADMITTIME', 'DISCHTIME','DEATHTIME',
                'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
                'EDREGTIME','EDOUTTIME'],
            parse_dates=['ADMITTIME', 'DISCHTIME', 'EDREGTIME','EDOUTTIME', 'DEATHTIME'])
        return df

    def get_length_of_stay(self, df, start_col, end_col):
        """Get length of stay in days, or days until death, store as VALUE.

        Writes the VALUE column into `df` and returns it. Where `end_col - start_col`
        is missing, DEATHTIME - start_col is used instead.
        """
        df['VALUE'] = (df[end_col] - df[start_col]).dt.days
        mask = df['VALUE'].isnull()
        # DEATHTIME is the column load() provides; 'DEATHDATE' does not exist in ADMISSIONS
        df.loc[mask, 'VALUE'] = (df.loc[mask, 'DEATHTIME'] - df.loc[mask, start_col]).dt.days
        return df

    def convert_admission_discharge_to_events(self, df, start_col, end_col, concept_name):
        """Convert to events: one admission row and one discharge row per input row.

        Admission rows use `start_col` as TIMESTAMP, discharge rows use `end_col`.
        For concept_name 'HOSPITAL', VALUE_CAT holds the admission location on
        admission rows and the discharge location on discharge rows.
        """
        dfdis = df.drop(columns=[start_col]).rename(columns={end_col: 'TIMESTAMP'})
        df = df.drop(columns=[end_col]).rename(columns={start_col: 'TIMESTAMP'})
        if concept_name=='HOSPITAL':
            df['VALUE_CAT'] = df['ADMISSION_LOCATION']
            # read the location from the discharge frame itself instead of relying on index alignment
            dfdis['VALUE_CAT'] = dfdis['DISCHARGE_LOCATION']
        df['CONCEPT'] = f'T{concept_name}_ADMISSION'
        dfdis['CONCEPT'] = f'T{concept_name}_DISCHARGE'
        return pd.concat([df, dfdis], axis=0)

    def get_concepts(self, df, start_col, end_col, concept_name):
        """Get concepts for admission and discharge.

        Selects only columns that load() provides: the patient id, the two
        time columns, DEATHTIME (needed for the length-of-stay fallback) and,
        for 'HOSPITAL', the two location columns used for VALUE_CAT.
        """
        columns = ['SUBJECT_ID', start_col, end_col, 'DEATHTIME']
        if concept_name == 'HOSPITAL':
            columns += ['ADMISSION_LOCATION', 'DISCHARGE_LOCATION']
        # copy so that VALUE is written to an independent frame, not a slice of the caller's frame
        df = df.loc[:, columns].copy()
        df = self.get_length_of_stay(df, start_col, end_col)
        df = self.convert_admission_discharge_to_events(df, start_col, end_col, concept_name)
        return df


In [None]:
dfa = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip', 
    parse_dates=['ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'EDREGTIME', 'EDOUTTIME'])

In [None]:
adfa = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip', 
    parse_dates=['ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'EDREGTIME', 'EDOUTTIME'])
# dfe = dfa[dfa['ADMISSION_TYPE']=='EMERGENCY']
# dfe[['ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'EDREGTIME', 'EDOUTTIME']]
# dfe.loc[dfe.ADMITTIME<dfe.EDREGTIME, ['ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'EDREGTIME', 'EDOUTTIME']]

In [None]:
dfa

In [None]:
df = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip', 
     parse_dates=['ADMITTIME', 'DISCHTIME'])
df

In [None]:
def update_metadata(concept_name, coding_sys, files_ls):
    print('concept_name: ', concept_name)
    print('coding_sys: ', coding_sys)
    print('files_ls: ', files_ls)
metadata_dic ={
            'diag': ['ICD9', ['DIAGNOSES_ICD.csv.gz', 'ADMISSIONS.csv.gz']],
            'med':['DrugName', ['PRESCRIPTIONS.csv.gz']]

        }
update_metadata('diag',*metadata_dic['diag'])

In [None]:
# Scratch check: a class can be looked up by its name through globals() and instantiated.
# The demo class has its own name so it does not replace the MIMICPreprocessor_transfer
# preprocessor defined earlier in the notebook.
class DemoPreprocessor:
    def __init__(self, test):
        print(test)
print(globals().keys())
class_ = globals()["DemoPreprocessor"]
instance = class_(True)

In [None]:
df = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip', 
    parse_dates=['ADMITTIME', 'DISCHTIME'])
df

In [None]:
df = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip', 
    usecols=['SUBJECT_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_TYPE', 'EDREGTIME','EDOUTTIME'], parse_dates=['ADMITTIME', 'DISCHTIME'])
dfp = pd.read_csv(join(mimic_path, 'PATIENTS.csv.gz'), compression='gzip',
    usecols=['SUBJECT_ID', 'DOD'], parse_dates=['DOD'])
df = df.merge(dfp, on='SUBJECT_ID', how='left')

dfa = df.loc[:, ['SUBJECT_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_TYPE']]
# use ADMITTIME and DISCHTIME as separate events to have a time series of events like hospitalization and discharge
dfa['VALUE'] = (dfa['DISCHTIME'] - dfa['ADMITTIME']).dt.days
dfd = dfa.copy(deep=True).drop(columns=['ADMITTIME'])
dfa = dfa.rename(columns={'ADMITTIME': 'TIME'}).drop(columns=['DISCHTIME'])
dfa['CONCEPT'] = 'THOSPITAL_ADMISSION'
dfd = dfd.rename(columns={'DISCHTIME': 'TIME'})
dfd['CONCEPT'] = 'THOSPITAL_DISCHARGE'
df = pd.concat([dfa, dfd], axis=0)
df.rename(columns={'ADMISSION_TYPE': 'VALUE_CAT'}, inplace=True)
df


In [None]:
df = pd.read_csv(join(mimic_path, 'PROCEDURES_ICD.csv.gz'), compression='gzip', nrows=10000,
    )
# df.NDC = df.NDC.astype('str')
# df['NDC'] = df.NDC.map(lambda x: x[1:])
# dfa = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip', nrows=10000, parse_dates=['ADMITTIME'],
    # usecols=['HADM_ID', 'ADMITTIME'])
# dfh = pd.read_csv(join(data_path, 'helper', 'NDC.csv'), usecols=['ndcpackagecode'])
# dfh['ndcpackagecode_simple'] = dfh.ndcpackagecode.str.replace('-', '')
# turn into dict
# dic = dfh.set_index('ndcpackagecode_simple').to_dict()['ndcpackagecode']
# dic[''] = ''
#df['NDC'] = df.NDC.map(dic)
#df['len'] = df.ndcpackagecode.map(lambda x: len(x))

df

In [None]:
df[df.TIMESTAMP.isnull()]

In [None]:
df[df.CONCEPT=='Furosemide']

In [None]:
df_cat = df[df.VALUE.str.contains('-')].copy()
df_cat['VALUE_CAT'] = df_cat['VALUE']
df_cat['VALUE'] = df_cat.groupby('CONCEPT')['VALUE'].transform(lambda x: x.astype('category').cat.codes)
df_cat.drop(columns=['VALUENUM'], inplace=True)
df_cat['VALUE_UNIT'] = 'categorical'
df_cat

In [None]:
df

In [None]:
dfd['TIMESTAMP'] = dfd['HADM_ID'].map(adm_dic)
dfd.rename(columns={'SUBJECT_ID':'PID',  'HADM_ID':'ADMISSION_ID', 'ICD9_CODE':'CONCEPT'}, inplace=True)

In [None]:
dfd['CONCEPT'] = dfd['CONCEPT'].map(lambda x: 'D'+str(x))
dfd

In [None]:
dfd.dtypes

In [None]:
# Load a sample of LABEVENTS into dfl (it was previously assigned to dfd, leaving dfl
# undefined on the next line) and rename its columns.
dfl = pd.read_csv(join(mimic_path, 'LABEVENTS.csv.gz'), compression='gzip',
            nrows=30000,
            parse_dates=['CHARTTIME'], dtype={'SUBJECT_ID': 'Int32', 'ITEMID': 'Int32', 'VALUE': 'str', 'VALUENUM': 'float32', 'VALUEUOM': 'str', 'HADM_ID': 'Int32'})
dfl = dfl.rename(columns={'SUBJECT_ID': 'PID', 'CHARTTIME': 'TIMESTAMP', 'VALUEUOM': 'VALUE_UNIT', 'HADM_ID': 'ADMISSION_ID'}).drop(columns=['ROW_ID', 'FLAG'])
dfl

In [None]:
dfld = pd.read_csv(join(mimic_path, 'D_LABITEMS.csv.gz'), compression='gzip')
dfld

In [None]:
# Replace every label containing 'SPECIMEN' with "<FLUID> <CATEGORY>".
# The mask is computed once; na=False treats missing labels as non-matching
# instead of producing NaN entries that cannot be used for indexing.
specimen_mask = dfld['LABEL'].str.contains('SPECIMEN', na=False)
dfld.loc[specimen_mask, 'LABEL'] = dfld.loc[specimen_mask, 'FLUID'] + ' ' + dfld.loc[specimen_mask, 'CATEGORY']
dfld

In [None]:
item_code_dic = pd.Series(dfld.LOINC_CODE.values, index=dfld.ITEMID).to_dict()
item_name_dic = pd.Series(dfld[dfld.LOINC_CODE.isna()].LABEL.values, index=dfld[dfld.LOINC_CODE.isna()].ITEMID).to_dict()
# combine dicts
item_dic = {**item_code_dic, **item_name_dic}

In [None]:
dfl['CONCEPT'] = dfl.ITEMID.map(item_dic)

In [None]:
dfl

In [None]:
df_cont = dfl[dfl['VALUENUM'].notnull()]
df_cont = df_cont[df_cont['VALUENUM'] >= 0]
df_cont.drop(columns=['VALUE'], inplace=True)
df_cont.rename(columns={'VALUENUM': 'VALUE'}, inplace=True)
df_cont['VALUE_CAT'] = 'NaN'

In [None]:
df = pd.concat([df_cont, df_cat], axis=0)
df

In [None]:
# Rows without a numeric value are treated as categorical.
# .copy() makes df_cat an independent frame rather than a slice of dfl, so adding
# a column no longer triggers SettingWithCopyWarning.
df_cat = dfl[dfl['VALUENUM'].isnull()].copy()
df_cat['VALUE_CAT'] = df_cat['VALUE']


In [None]:
df_cat['VALUE'] = df_cat.groupby('CONCEPT')['VALUE'].transform(lambda x: x.astype('category').cat.codes)
df_cat.drop(columns=['VALUENUM'], inplace=True)
df_cat['VALUE_UNIT'] = 'categorical'

In [None]:
df_cat = df_cat.rename(columns={'VALUE': 'VALUE_CAT'})
df_cont['VALUE_CAT'] = pd.Series(np.nan, index=df_cont.index)

In [None]:
df_cont

In [None]:
# The column is named 'CONCEPT'. Assigning to the attribute `df_cat.concept` would not
# create or update a column, so use item access on both sides.
df_cat['CONCEPT'] = df_cat['CONCEPT'].map(lambda x: 'L'+x)

In [None]:
# filter on the 'CONCEPT' column (there is no lowercase 'concept' column)
df_cat[df_cat['CONCEPT']=='LIntubated']

In [None]:
df_cat.attrs['SYSTEM'] = 'LOINC'

In [None]:
pa_tab = pa.Table.from_pandas(df_cat)
# Arrow tables are immutable: metadata is attached by building a new table whose schema
# carries it. Metadata already on the schema is kept and 'SYSTEM' is added.
pa_tab = pa_tab.replace_schema_metadata({**(pa_tab.schema.metadata or {}), 'SYSTEM': 'LOINC'})

In [None]:
df_dt.VALUEUOM.unique()

In [None]:
load_csv(join(mimic_path, 'D_LABITEMS.csv.gz'))

In [None]:
df_adm = pd.read_csv(join(mimic_path, tracking_sheet[0]), compression='gzip')
df_adm

In [None]:
df_adm = pd.read_csv(join(mimic_path, 'ADMISSIONS.csv.gz'), compression='gzip')
print('length', len(df_adm))
print('unique patients', len(df_adm['SUBJECT_ID'].unique()))
df_adm.head()

In [None]:
df_co = pd.read_csv(join(mimic_path, 'CALLOUT.csv.gz'), compression='gzip')
df_co.head()
print('length', len(df_co))
print('unique patients', len(df_co['SUBJECT_ID'].unique()))
df_co.head()

In [None]:
# polars is imported as `pl`; `ps` is not defined anywhere in this notebook
df_co = pl.read_csv(join(mimic_path, 'CALLOUT.csv.gz'))
sys.getsizeof(df_co)

In [None]:
# not relevant for our analysis
df_cg = pd.read_csv(join(mimic_path, 'CAREGIVERS.csv.gz'), compression='gzip')

In [None]:
pd_df = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=50000)
#pd_df.to_parquet(join(mimic_path, 'CHARTEVENTS_sample.parquet'), index=False)

In [None]:
df_ps = pl.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'),  n_rows=5000, parse_dates=True)

In [None]:
# no information on type of event, only value and ID
df_ce = pd.read_csv(join(mimic_path, 'CHARTEVENTS.csv.gz'), compression='gzip', nrows=10000)
df_ce.head()

General approach:
- create a dataframe that contains: event_name, timestamp, value, visit, age 
- separate dataframes into batches