# Preprocessing: `chartevents` Table

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
os.chdir('../../')
from tqdm import tqdm
# from utils.icu_preprocess_util import *   
from utils.labs_preprocess_util import *

### Define `chartevent` arguments

In [2]:
mimic4_path = "./mimic-iv-1.0/"
usecols = [
    "subject_id",
    "hadm_id",
    "stay_id",
    "charttime",
    "itemid",
    "valuenum",
    "valueuom"
]

dtypes = {
    'itemid':'int64',
    'subject_id':'int64',
    'stay_id':'int64',
    # 'hadm_id':'int64',            # hadm_id type not defined because it contains NaN values
    # 'charttime':'datetime64[ns]', # used as an argument in 'parse_cols' in pd.read_csv
    'valuenum':'float64',
    'valueuom':'object',
}

### Define modified `preproc_chart` to read in `chartevents` table

In [3]:
def preproc_chart(dataset_path: str, cohort_path:str, time_col:str, dtypes: dict, usecols: list) -> pd.DataFrame:
    """Function for getting hosp observations pertaining to a pickled cohort. Function is structured to save memory when reading and transforming data."""
    
    # Only consider values in our cohort
    cohort = pd.read_csv(cohort_path, compression='gzip', parse_dates = ['intime'])
    df_cohort=pd.DataFrame()
        # read module w/ custom params
    chunksize = 5000000
    for chunk in tqdm(pd.read_csv(dataset_path, compression='gzip', usecols=usecols, dtype=dtypes, parse_dates=[time_col],chunksize=chunksize)):
        #print(chunk.head())
        # chunk['valuenum']=chunk['valuenum'].fillna(0) # <<< commented out this line
        chunk_merged=chunk.merge(cohort[['stay_id', 'intime']], how='inner', left_on='stay_id', right_on='stay_id')
        chunk_merged['event_time_from_admit'] = chunk_merged[time_col] - chunk_merged['intime']
        
        del chunk_merged[time_col] 
        del chunk_merged['intime']
        chunk_merged=chunk_merged.dropna()
        chunk_merged=chunk_merged.drop_duplicates()
        
        if df_cohort.empty:
            df_cohort=chunk_merged
        else:
            df_cohort=df_cohort.append(chunk_merged, ignore_index=True)

    # Print unique counts and value_counts
    print("# Unique Events:  ", df_cohort.itemid.dropna().nunique())
    print("# Admissions:  ", df_cohort.stay_id.nunique())
    print("Total rows", df_cohort.shape[0])

    # Only return module measurements within the observation range, sorted by subject_id
    return df_cohort

In [4]:
# get chartevents data for our cohort
charts = preproc_chart(mimic4_path + "icu/chartevents.csv.gz", mimic4_path + "icu/icustays.csv.gz", "charttime", dtypes, usecols)

66it [08:02,  7.32s/it]


# Unique Events:   454
# Admissions:   76529
Total rows 81030530


In [5]:
charts.head()

Unnamed: 0,subject_id,hadm_id,stay_id,itemid,valuenum,valueuom,event_time_from_admit
0,10003700,28623837,30600691,220179,152.0,mmHg,-1 days +23:45:00
1,10003700,28623837,30600691,220180,97.0,mmHg,-1 days +23:45:00
2,10003700,28623837,30600691,220181,110.0,mmHg,-1 days +23:45:00
3,10003700,28623837,30600691,220045,65.0,bpm,-1 days +23:47:00
4,10003700,28623837,30600691,220210,14.0,insp/min,-1 days +23:47:00


In [5]:
# Drop NaN values for itemid, valuenum, and valueuom
for col in ['itemid', 'valuenum', 'valueuom']:
    print(f"NaN rows in {col}:", charts['itemid'].loc[charts['itemid'].isna()].shape[0])
    
charts.dropna(subset=['itemid', 'valuenum', 'valueuom'], inplace=True)
print(charts.shape)

NaN rows in itemid: 0
NaN rows in valuenum: 0
NaN rows in valueuom: 0
(81030530, 7)


In [6]:
# non standard itemids; have more than 1 unit of measure
nonstd_itemids = list(charts[['itemid', 'valueuom']].dropna().drop_duplicates().groupby(by='itemid').size().loc[charts[['itemid', 'valueuom']].dropna().drop_duplicates().groupby(by='itemid').size() > 1].index)
nonstd_itemids

[227441, 229357, 229358, 229360]

In [8]:
charts_preproc = charts.copy()
print("chartevents size before filtering for majority uom", charts_preproc.shape)
for i in nonstd_itemids:
    try:
        maj = charts_preproc.loc[charts_preproc.itemid == i].valueuom.value_counts().index[0]
        charts_preproc = charts_preproc.loc[~((charts_preproc.itemid == i) & (charts_preproc.valueuom == maj))]
    except IndexError:
        print(f"{idx} not found")

print("chartevents size after filtering for majority uom", charts_preproc.shape)

chartevents size before filtering for majority uom (81030530, 7)
chartevents size after filtering for majority uom (80961394, 7)
