# Preprocessing: labevents Table

In [1]:
import os
import pickle
os.chdir('../')
from utils.hosp_preprocess_util import *    # module of preprocessing functions

### Reading in and Preparing Labs for Long Format

In [2]:
mimic4_path = "./mimic-iv-1.0/hosp/labevents.csv.gz"
usecols = [1, 2, 4, 5, 7, 8, 9, 12]
dtypes = {
    'itemid':'int64',
    'subject_id':'int64',
    # 'hadm_id':'int64',            # hadm_id type not defined because it contains NaN values
    # 'charttime':'datetime64[ns]', # used as an argument in 'parse_cols' in pd.read_csv
    'value':'object',
    'valuenum':'float64',
    'valueuom':'object',
    'flag':'object'
}
labs = timestamp_cohort_data(mimic4_path, "./data/cohort.gzip", 'charttime', 'base_anchor_year', dtypes, usecols)

In [3]:
# Review data
labs.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag,admit_year,label,base_anchor_year,max_year_group,min_year_group,anchor_year,timedelta_days,timedelta_years
68725657,10000032,22841357.0,51277,2180-06-27 05:10:00,15.7,15.7,%,abnormal,2180,0,2174,2016,2014,2180,2369,6.0
68725675,10000032,,51274,2180-07-23 06:39:00,16.2,16.2,sec,abnormal,2180,0,2174,2016,2014,2180,2395,6.0
68725676,10000032,,51275,2180-07-23 06:39:00,32.4,32.4,sec,,2180,0,2174,2016,2014,2180,2395,6.0
68725677,10000032,,50861,2180-07-23 06:39:00,153.0,153.0,IU/L,abnormal,2180,0,2174,2016,2014,2180,2395,6.0
68725678,10000032,,50862,2180-07-23 06:39:00,3.6,3.6,g/dL,,2180,0,2174,2016,2014,2180,2395,6.0


In [4]:
print(labs.value.isna().sum())
print(labs.valuenum.isna().sum())
labs.loc[(~labs.value.isna()) & (labs.valuenum.isna())].head()

7297609
9540521


Unnamed: 0,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag,admit_year,label,base_anchor_year,max_year_group,min_year_group,anchor_year,timedelta_days,timedelta_years
68725921,10000032,22841357.0,51519,2180-06-26 22:45:00,NONE,,,,2180,0,2174,2016,2014,2180,2368,6.0
68725920,10000032,,51079,2180-06-22 11:15:00,NEG,,,,2180,0,2174,2016,2014,2180,2364,6.0
68725919,10000032,,51519,2180-06-26 18:30:00,NONE,,,,2180,0,2174,2016,2014,2180,2368,6.0
68725918,10000032,,51074,2180-06-22 11:15:00,NEG,,,,2180,0,2174,2016,2014,2180,2364,6.0
68725917,10000032,,51071,2180-06-22 11:15:00,NEG,,,,2180,0,2174,2016,2014,2180,2364,6.0


### Saving the long-format dataset

In [5]:
# Save long format dataset
labs[['subject_id', 'hadm_id', 'itemid', 'timedelta_days', 'valuenum']].dropna(subset=['subject_id','itemid', 'timedelta_days', 'valuenum']).to_pickle("./data/long_format/labs/long_labs.gzip", compression='gzip')

In [6]:
labs

Unnamed: 0,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag,admit_year,label,base_anchor_year,max_year_group,min_year_group,anchor_year,timedelta_days,timedelta_years
68725657,10000032,22841357.0,51277,2180-06-27 05:10:00,15.7,15.7,%,abnormal,2180,0,2174,2016,2014,2180,2369,6.0
68725675,10000032,,51274,2180-07-23 06:39:00,16.2,16.2,sec,abnormal,2180,0,2174,2016,2014,2180,2395,6.0
68725676,10000032,,51275,2180-07-23 06:39:00,32.4,32.4,sec,,2180,0,2174,2016,2014,2180,2395,6.0
68725677,10000032,,50861,2180-07-23 06:39:00,153,153.0,IU/L,abnormal,2180,0,2174,2016,2014,2180,2395,6.0
68725678,10000032,,50862,2180-07-23 06:39:00,3.6,3.6,g/dL,,2180,0,2174,2016,2014,2180,2395,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68720055,19999987,23865745.0,50902,2145-11-07 06:00:00,108,108.0,mEq/L,,2145,0,2142,2013,2011,2145,1406,3.0
68720056,19999987,23865745.0,50912,2145-11-07 06:00:00,0.9,0.9,mg/dL,,2145,0,2142,2013,2011,2145,1406,3.0
68720057,19999987,23865745.0,50960,2145-11-07 06:00:00,1.8,1.8,mg/dL,,2145,0,2142,2013,2011,2145,1406,3.0
68720051,19999987,23865745.0,51301,2145-11-07 06:00:00,5.0,5.0,K/uL,,2145,0,2142,2013,2011,2145,1406,3.0


### Processing data with multiple units of measure

Different units of measure lead to different value numbers, so one unit of measure must be applied (either by checking that the units are synonymous or removing the less frequent units)

In [7]:
# drop unnecessary columns and NaN values for faster processing
labs.drop(labels=['charttime', 'value', 'flag', 'admit_year','label','base_anchor_year','max_year_group','min_year_group','anchor_year'], inplace=True, axis=1)
labs.dropna(subset=['itemid', 'valuenum', 'valueuom'], inplace=True)

In [8]:
print("Number of unique labevent itemids in cohort: ", labs.itemid.dropna().nunique())

# Top 100 unique itemids per patient with valid valuenum values
top_items = list(labs[['subject_id', 'itemid','valuenum']].dropna().drop_duplicates(subset=['subject_id', 'itemid']).itemid.value_counts()[:100].index)

# non standard itemids; have more than 1 unit of measure
nonstd_itemids = list(labs[['itemid', 'valueuom']].dropna().drop_duplicates().groupby(by='itemid').size().loc[labs[['itemid', 'valueuom']].dropna().drop_duplicates().groupby(by='itemid').size() > 1].index)

Number of unique labevent itemids in cohort:  463


In [9]:
nonstd_itemids
# 50915 -> ['ng/mL', 'ng/mL FEU'] same
# 50993 -> ['uIU/mL', 'uU/ML'] same
# 51099 -> ['mg/mg' 'Ratio'] diff, majority=ratio
# 51228 -> ['uIU/mL', 'uU/ML'] same
# 51249 -> ['%' 'g/dL'] diff, majority=%
# 51282 -> ['m/uL' '/mm3'] diff, majority=m/uL
# 51464 -> ['m/uL' '/mm3'] diff, majority=mg/dL

[50915, 50993, 51099, 51228, 51249]

In [10]:
for idx in nonstd_itemids:
    print(idx)
    print(labs.loc[labs.itemid == idx].valueuom.unique())
    print(labs.loc[labs.itemid == idx].valueuom.value_counts())
    labs.loc[labs.itemid == idx].valueuom.value_counts().index[0]

50915
['ng/mL' 'ng/mL FEU']
ng/mL        11196
ng/mL FEU      369
Name: valueuom, dtype: int64
50993
['uIU/mL' 'uU/ML']
uIU/mL    170373
uU/ML          6
Name: valueuom, dtype: int64
51099
['mg/mg' 'Ratio']
Ratio    29118
mg/mg    16842
Name: valueuom, dtype: int64
51228
['U/mL' 'IU/mL']
U/mL     804
IU/mL     17
Name: valueuom, dtype: int64
51249
['%' 'g/dL']
%       1666211
g/dL     231385
Name: valueuom, dtype: int64


In [11]:
labs_preproc = labs.copy()
print(labs_preproc.shape)
for i in [51249, 51282]:
    try:
        maj = labs_preproc.loc[labs_preproc.itemid == i].valueuom.value_counts().index[0]
        labs_preproc = labs_preproc.loc[~((labs_preproc.itemid == i) & (labs_preproc.valueuom == maj))]
        print(labs_preproc.shape)
    except IndexError:
        print(f"{idx} not found")

(54849725, 7)
(53183514, 7)
(53180040, 7)


In [12]:
# Save long format dataset
labs_preproc[['subject_id', 'hadm_id', 'itemid', 'timedelta_days', 'valuenum']].dropna(subset=['subject_id','itemid', 'timedelta_days', 'valuenum']).to_pickle("./data/long_format/labs/long_labs_units_cleaned.gzip", compression='gzip')

In [13]:
pd.read_pickle("./data/long_format/labs/long_labs_units_cleaned.gzip", compression='gzip')

Unnamed: 0,subject_id,hadm_id,itemid,timedelta_days,valuenum
68725657,10000032,22841357.0,51277,2369,15.7
68725675,10000032,,51274,2395,16.2
68725676,10000032,,51275,2395,32.4
68725677,10000032,,50861,2395,153.0
68725678,10000032,,50862,2395,3.6
...,...,...,...,...,...
68720055,19999987,23865745.0,50902,1406,108.0
68720056,19999987,23865745.0,50912,1406,0.9
68720057,19999987,23865745.0,50960,1406,1.8
68720051,19999987,23865745.0,51301,1406,5.0


### One Hot Encoding

Old code for pivoting long_format data into the form:
```
                                    || feature 1 || ... || feature n ||
|| subject_id || label || timedelta ||
```

In [None]:
# # Pivots data into correct format; subject_id and timedelta as indices. Only take top 100 itemids
# pivot_labs = pivot_cohort(labs, "labs_", target_col='itemid',  values='valuenum', ohe=False, max_features=100)
# # Save output
# pivot_labs.to_pickle('./data/labs/pivot_labs.gzip', compression='gzip')