# MIMIC 4 data - dataset construction outputevents

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [1]:
import os
import pathlib
import pandas as pd

# Project root, taken as two directory levels above the current working directory.
p_project = str(pathlib.Path.cwd().parents[1])

In [2]:
# Display limits for DataFrames rendered in this notebook.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [3]:
# Directory holding the MIMIC 4 files, relative to the project root.
path_data = f'{p_project}/data/mimic4'

In [4]:
# Admissions table written to the processed/ folder; its hadm_id column
# is used below to filter the output events.
adm = pd.read_csv(f'{path_data}/processed/admissions_processed.csv')

In [5]:
# Load the raw ICU output events and preview the last five rows.
outputs = pd.read_csv(f'{path_data}/raw/icu/outputevents.csv.gz')
outputs.tail(5)

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom
4234962,19999987,23865745,36195440,86756,2145-11-03 05:00:00,2145-11-03 05:48:00,226559,25.0,ml
4234963,19999987,23865745,36195440,86756,2145-11-03 05:48:00,2145-11-03 05:48:00,226576,300.0,ml
4234964,19999987,23865745,36195440,86756,2145-11-03 07:00:00,2145-11-03 07:00:00,226559,30.0,ml
4234965,19999987,23865745,36195440,90295,2145-11-02 23:25:00,2145-11-02 23:25:00,226627,200.0,ml
4234966,19999987,23865745,36195440,91879,2145-11-04 20:03:00,2145-11-04 20:03:00,226560,325.0,ml


In [6]:
# Keep only the events whose hadm_id appears in the admissions table.
adm_ids = list(adm['hadm_id'])
outputs = outputs[outputs['hadm_id'].isin(adm_ids)]

# Report how many distinct patients are left after the filter.
print('Number of patients remaining in the database: ',
      outputs['subject_id'].nunique(),
      sep='\n')

Number of patients remaining in the database: 
41780


In [7]:
# Attach the item label from d_items to every output event.
item_id = pd.read_csv(path_data + '/raw/icu/d_items.csv.gz')
item_id_1 = item_id[['itemid', 'label']]

# Inner join on itemid: events whose itemid has no row in d_items are dropped.
# validate='many_to_one' makes pandas raise a MergeError if d_items contains a
# duplicated itemid, which would otherwise silently duplicate event rows.
outputs_2 = pd.merge(outputs, item_id_1, on='itemid', validate='many_to_one')

# Report how many distinct patients are left after the join.
print('Number of patients remaining in the database: ')
print(outputs_2['subject_id'].nunique())

Number of patients remaining in the database: 
41780


In [8]:
# Restrict the events to the output labels listed below.
outputs_label_list = [
    'Foley',
    'Void',
    'OR Urine',
    'Chest Tube #1',
    'Oral Gastric',
    'Pre-Admission',
    'TF Residual',
    'OR EBL',
    'Emesis',
    'Nasogastric',
    'Stool',
    'Jackson Pratt #1',
    'Straight Cath',
    'TF Residual Output',
    'Fecal Bag',
]
outputs_bis = outputs_2[outputs_2['label'].isin(outputs_label_list)].copy()

# Report the number of distinct patients and of rows left after the selection.
print('Number of patients remaining in the database: ',
      outputs_bis['subject_id'].nunique(),
      sep='\n')
print('Number of datapoints remaining in the database: ',
      len(outputs_bis),
      sep='\n')

# Independent copy of the selection, used by the following cells.
outputs_3 = outputs_bis.copy()

Number of patients remaining in the database: 
41621
Number of datapoints remaining in the database: 
3506859


In [9]:
# Show the count of each unit (valueuom) per output label, to check by eye
# that every label is recorded in a single unit.
(
    outputs_3
    .groupby('label')['valueuom']
    .value_counts()
)

label               valueuom
Chest Tube #1       ml           249449
Emesis              ml             7831
Fecal Bag           ml            11726
Foley               ml          2749279
Jackson Pratt #1    ml            53454
Nasogastric         ml            27250
OR EBL              ml             8903
OR Urine            ml            16155
Oral Gastric        ml            23153
Pre-Admission       ml            10751
Stool               ml            13590
Straight Cath       ml             9375
TF Residual         ml            75858
TF Residual Output  ml             7366
Void                ml           242719
Name: valueuom, dtype: int64

In [10]:
# Write the selected output events to the processed/ folder
# (the DataFrame index is written as the first column).
outputs_3.to_csv(f'{path_data}/processed/outputs_processed.csv')