# Preprocessing: labevents Table

In [1]:
import os
import pickle
os.chdir('../../')
from utils.hosp_preprocess_util import *    # module of preprocessing functions

In [2]:
from utils.labs_preprocess_util import *

### Reading in and Preparing Labs for Long Format

In [3]:
mimic4_path = "./mimic-iv-1.0/hosp/labevents.csv.gz"
usecols = [1, 2, 4, 5, 7, 8, 9, 12]
dtypes = {
    'itemid':'int64',
    'subject_id':'int64',
    # 'hadm_id':'int64',            # hadm_id type not defined because it contains NaN values
    # 'charttime':'datetime64[ns]', # used as an argument in 'parse_cols' in pd.read_csv
    'value':'object',
    'valuenum':'float64',
    'valueuom':'object',
    'flag':'object'
}
labs = timestamp_cohort_data(mimic4_path, "./data/cohort.gzip", 'charttime', 'base_anchor_year', dtypes, usecols)
# labs.to_csv(".data/long_format/labs/labs.csv.gz", compression="gzip", index=False)

In [4]:
# Review data
labs.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag,admit_year,label,base_anchor_year,max_year_group,min_year_group,anchor_year,timedelta_days,timedelta_years
0,10000032,22841357.0,51277,2180-06-27 05:10:00,15.7,15.7,%,abnormal,2180,0,2174,2016,2014,2180,2369,6.0
1,10000032,,51274,2180-07-23 06:39:00,16.2,16.2,sec,abnormal,2180,0,2174,2016,2014,2180,2395,6.0
2,10000032,,51275,2180-07-23 06:39:00,32.4,32.4,sec,,2180,0,2174,2016,2014,2180,2395,6.0
3,10000032,,50861,2180-07-23 06:39:00,153.0,153.0,IU/L,abnormal,2180,0,2174,2016,2014,2180,2395,6.0
4,10000032,,50862,2180-07-23 06:39:00,3.6,3.6,g/dL,,2180,0,2174,2016,2014,2180,2395,6.0


### Imputing invalid hadm_ids

In [None]:
# labs = pd.read_csv("./data/long_format/labs/labs.csv.gz", compression="gzip", header=0)
adm = pd.read_csv("./mimic-iv-1.0/core/admissions.csv.gz", compression='gzip', header=0, index_col=None, usecols=['subject_id', 'hadm_id', 'admittime', 'dischtime'], parse_dates=['admittime', 'dischtime'])

In [5]:
# Use imputation function to impute missing hadm_ids where possible
imputed_labs = impute_hadm_ids(labs[['subject_id','hadm_id','itemid','charttime','value','valuenum','valueuom','label', 'timedelta']].copy(), adm)

  if (await self.run_code(code, result,  async_=asy)):


In [14]:
# Review output
imputed_labs.head()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,label,hadm_id_new,admittime,dischtime
0,0,18695057,26762941.0,50912,2137-05-31 21:23:00,1.0,1.0,mg/dL,0,26762941.0,2137-05-25 02:46:00,2137-06-01 14:25:00
1,1,18695057,26762941.0,50934,2137-05-31 21:23:00,8.0,8.0,,0,26762941.0,2137-05-25 02:46:00,2137-06-01 14:25:00
2,2,18695057,26762941.0,50947,2137-05-31 21:23:00,0.0,0.0,,0,26762941.0,2137-05-25 02:46:00,2137-06-01 14:25:00
3,3,18695057,26762941.0,50960,2137-05-31 21:23:00,1.6,1.6,mg/dL,0,26762941.0,2137-05-25 02:46:00,2137-06-01 14:25:00
4,4,18695057,26762941.0,50970,2137-05-31 21:23:00,4.0,4.0,mg/dL,0,26762941.0,2137-05-25 02:46:00,2137-06-01 14:25:00


In [None]:
# save imputed dataset
imputed_labs[['subject_id', 'hadm_id_new', 'itemid', 'charttime', 'valuenum']].dropna(subset=['subject_id','itemid', 'valuenum']).to_csv("./data/long_format/labs/preproc_labs_imputed.csv.gz", compression='gzip', index=False)

In [13]:
invalids = imputed_labs_gb.loc[imputed_labs_gb.hadm_id_new.isna()].shape[0]
print(f"{invalids}/{imputed_labs_gb.shape[0]} hadm_ids are invalid")
print(f"{labs.loc[labs.hadm_id.isna()].shape[0]} were originally invalid")

20216301/65793007 hadm_ids are invalid
28658188 were originally invalid


In [None]:
print("Unique, non-NaN features in \'value\'      ", labs.value.dropna().nunique())
print("Unique, non-NaN features in \'valuenum\'   ", labs.valuenum.dropna().nunique())

print("Total valid rows in \'value\'      ", labs.value.dropna().shape[0])
print("Total valid rows in \'valuenum\'   ", labs.valuenum.dropna().shape[0])
# labs.loc[(~labs.value.isna()) & (labs.valuenum.isna())].head()

### Saving the long-format dataset

In [None]:
# Save long format dataset
labs[['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum']].dropna(subset=['subject_id','itemid', 'valuenum']).to_pickle("./data/long_format/labs/preproc_labs.gzip", compression='gzip')

### Processing data with multiple units of measure

Different units of measure lead to different value numbers, so one unit of measure must be applied (either by checking that the units are synonymous or removing the less frequent units)

In [None]:
# drop unnecessary columns and NaN values for faster processing
labs.drop(labels=['flag', 'admit_year','label','base_anchor_year','max_year_group','min_year_group','anchor_year'], inplace=True, axis=1)
labs.dropna(subset=['itemid', 'valuenum', 'valueuom'], inplace=True)

In [None]:
print("Number of unique labevent itemids in cohort: ", labs.itemid.dropna().nunique())

# Top 100 unique itemids per patient with valid valuenum values
top_items = list(labs[['subject_id', 'itemid','valuenum']].dropna().drop_duplicates(subset=['subject_id', 'itemid']).itemid.value_counts()[:100].index)

# non standard itemids; have more than 1 unit of measure
nonstd_itemids = list(labs[['itemid', 'valueuom']].dropna().drop_duplicates().groupby(by='itemid').size().loc[labs[['itemid', 'valueuom']].dropna().drop_duplicates().groupby(by='itemid').size() > 1].index)

In [None]:
nonstd_itemids
# 50915 -> ['ng/mL', 'ng/mL FEU'] same
# 50993 -> ['uIU/mL', 'uU/ML'] same
# 51099 -> ['mg/mg' 'Ratio'] diff, majority=ratio
# 51228 -> ['uIU/mL', 'uU/ML'] same
# 51249 -> ['%' 'g/dL'] diff, majority=%
# 51282 -> ['m/uL' '/mm3'] diff, majority=m/uL
# 51464 -> ['m/uL' '/mm3'] diff, majority=mg/dL

In [None]:
for idx in nonstd_itemids:
    print(idx)
    print(labs.loc[labs.itemid == idx].valueuom.unique())
    print(labs.loc[labs.itemid == idx].valueuom.value_counts())
    labs.loc[labs.itemid == idx].valueuom.value_counts().index[0]

In [None]:
labs_preproc = labs.copy()
print(labs_preproc.shape)
for i in [51249, 51282]:
    try:
        maj = labs_preproc.loc[labs_preproc.itemid == i].valueuom.value_counts().index[0]
        labs_preproc = labs_preproc.loc[~((labs_preproc.itemid == i) & (labs_preproc.valueuom == maj))]
        print(labs_preproc.shape)
    except IndexError:
        print(f"{idx} not found")

In [None]:
# Save long format dataset
labs_preproc[['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum']].dropna(subset=['subject_id','itemid', 'charttime', 'valuenum']).to_csv("./data/long_format/labs/preproc_labs_units_cleaned.csv.gz", compression='gzip', index=False)

In [None]:
pd.read_pickle("./data/long_format/labs/long_labs_units_cleaned.gzip", compression='gzip')

### Test

In [None]:
adm = pd.read_pickle("./data/adm_cohort.gzip", compression='gzip')

In [None]:
test = labs.merge(adm[['hadm_id', 'admittime']], how='left', left_on='hadm_id', right_on='hadm_id')

In [None]:
test.loc[test.admittime > test.charttime]

In [None]:
pd.read_pickle('./data/long_format/labs/preproc_labs_units_cleaned.gzip', compression='gzip').to_csv('./data/long_format/labs/preproc_labs_units_cleaned.csv.gz', compression='gzip', index=False)