# MIMIC 4 data - dataset construction inputevents

Code adapted from the GRU-ODE-Bayes preprocessing pipeline, simplified and updated for MIMIC-IV v1.0.

In [1]:
import os
import pathlib
import pandas as pd
from datetime import datetime
from datetime import timedelta
import numpy as np

# Project root: two directory levels above the current working directory.
p_project = str(pathlib.Path.cwd().parents[1])

In [2]:
# Folder holding the MIMIC-IV files; the raw/ and processed/ sub-folders are read below.
path_data = f"{p_project}/data/mimic4"

In [3]:
# Processed admissions table; its hadm_id column is used to filter the input events.
admissions_csv = path_data + '/processed/admissions_processed.csv'
adm_3 = pd.read_csv(admissions_csv)

In [4]:
# Load the raw ICU input events and keep only the rows whose hadm_id belongs to
# one of the previously selected admissions (adm_3).
adm_ids = adm_3['hadm_id'].tolist()
inputs = pd.read_csv(path_data + '/raw/icu/inputevents.csv.gz')
in_selected_admission = inputs['hadm_id'].isin(adm_ids)
inputs = inputs.loc[in_selected_admission]
inputs.head()

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,...,ordercomponenttypedescription,ordercategorydescription,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate
18,10001217,24597018,37067082,7355,2157-11-21 19:39:00,2157-11-21 19:40:00,2157-11-21 19:39:00,225154,2.0,mg,...,Main order parameter,Drug Push,71.2,,,0,0,FinishedRunning,2.0,2.0
19,10001217,24597018,37067082,7355,2157-11-21 21:28:00,2157-11-21 21:29:00,2157-11-21 21:28:00,226452,50.0,ml,...,Main order parameter,Bolus,71.2,50.0,ml,0,0,FinishedRunning,50.0,50.0
20,10001217,24597018,37067082,7355,2157-11-21 21:28:00,2157-11-21 21:29:00,2157-11-21 21:28:00,228315,1000.0,mg,...,Main order parameter,Drug Push,71.2,,,0,0,FinishedRunning,1000.0,999.999939
21,10001217,24597018,37067082,71578,2157-11-21 00:51:00,2157-11-21 02:06:00,2157-11-21 00:52:00,226089,249.999994,ml,...,Main order parameter,Continuous IV,71.2,250.0,ml,0,0,FinishedRunning,250.0,200.0
22,10001217,24597018,37067082,72136,2157-11-21 08:00:00,2157-11-21 08:01:00,2157-11-21 15:58:00,225975,1.0,dose,...,Main order parameter,Drug Push,71.2,,,0,0,FinishedRunning,1.0,1.0


In [5]:
# Restrict the events to the columns used by the rest of the notebook.
columns_of_interest = [
    'subject_id',
    'hadm_id',
    'starttime',
    'endtime',
    'itemid',
    'amount',
    'amountuom',
    'rate',
    'rateuom',
    'patientweight',
    'ordercategorydescription',
]
inputs_small = inputs[columns_of_interest]

# Report how many distinct patients are still present.
n_patients = inputs_small['subject_id'].nunique()
print('Number of patients remaining in the database: ')
print(n_patients)

Number of patients remaining in the database: 
42066


In [6]:
# Attach the human-readable item label (from d_items) to every input event.
item_id = pd.read_csv(path_data + '/raw/icu/d_items.csv.gz')
item_id_1 = item_id[['itemid', 'label']]

# pd.merge defaults to an inner join: events whose itemid has no entry in
# d_items would be dropped, hence the patient count is printed again below.
inputs_small_2 = pd.merge(inputs_small, item_id_1, on='itemid')

print('Number of patients remaining in the database: ')
print(inputs_small_2['subject_id'].nunique())

Number of patients remaining in the database: 
42066


In [7]:
# Keep only the input events whose label is in the list below.
# Missing from the list: Fresh Frozen Plasma.
retained_list = [
    "Albumin 5%",
    "Dextrose 5%",
    "Lorazepam (Ativan)",
    "Calcium Gluconate",
    "Midazolam (Versed)",
    "Phenylephrine",
    "Furosemide (Lasix)",
    "Norepinephrine",
    "Magnesium Sulfate",
    "Nitroglycerin",
    "Insulin - Glargine",
    "Insulin - Humalog",
    "Insulin - Regular",
    "Heparin Sodium",
    "Morphine Sulfate",
    "Potassium Chloride",
    "Packed Red Blood Cells",
    "Gastric Meds",
    "D5 1/2NS",
    "LR",
    "K Phos",
    "Solution",
    "Sterile Water",
    "Metoprolol",
    "Piggyback",
    "OR Crystalloid Intake",
    "OR Cell Saver Intake",
    "PO Intake",
    "GT Flush",
    "KCL (Bolus)",
    "Magnesium Sulfate (Bolus)",
]
has_retained_label = inputs_small_2["label"].isin(retained_list)
inputs_small_3 = inputs_small_2.loc[has_retained_label].copy()

# Labels actually present after filtering.
print(inputs_small_3['label'].unique())

['Morphine Sulfate' 'PO Intake' 'Piggyback' 'Dextrose 5%'
 'Magnesium Sulfate' 'Sterile Water' 'Lorazepam (Ativan)' 'Solution'
 'Gastric Meds' 'GT Flush' 'Heparin Sodium' 'Midazolam (Versed)'
 'Nitroglycerin' 'Insulin - Regular' 'Albumin 5%' 'Insulin - Glargine'
 'Furosemide (Lasix)' 'Phenylephrine' 'OR Crystalloid Intake'
 'OR Cell Saver Intake' 'LR' 'Potassium Chloride' 'KCL (Bolus)'
 'Calcium Gluconate' 'Packed Red Blood Cells' 'Magnesium Sulfate (Bolus)'
 'Norepinephrine' 'Insulin - Humalog' 'K Phos' 'Metoprolol' 'D5 1/2NS']


In [8]:
# Inventory of the amount units recorded for each label. The display limits are
# lifted so that the full table is printed without truncation.
amount_unit_counts = inputs_small_3.groupby('label')['amountuom'].value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(amount_unit_counts)

label                      amountuom  
Albumin 5%                 ml              24231
Calcium Gluconate          grams           56516
D5 1/2NS                   ml              25270
                           L                   3
Dextrose 5%                ml             942824
Furosemide (Lasix)         mg              92557
GT Flush                   ml             195686
                           cm3                 1
Gastric Meds               ml             264344
                           L                   1
Heparin Sodium             units           67938
Insulin - Glargine         units           27554
Insulin - Humalog          units           77685
Insulin - Regular          units          261278
K Phos                     mmol             8275
KCL (Bolus)                ml              92846
LR                         ml              99889
                           L                   2
Lorazepam (Ativan)         mg              33846
Magnesium Sulfate          gra

In [9]:
# Keep a single amount unit per label: for each label listed below, every row
# recorded in a different unit is discarded. Labels not listed are left untouched.
# (The 'Heparin Sodium (Prophylaxis)' cleaning of the original preprocessing is
# intentionally not applied here.)
# NOTE(review): `!=` is also True when amountuom is missing, so rows of these
# labels without any amount unit are discarded as well.
amount_unit_by_label = {
    'Magnesium Sulfate': 'grams',
    'Metoprolol': 'mg',
    'D5 1/2NS': 'ml',
    'LR': 'ml',
    'OR Crystalloid Intake': 'ml',
    'PO Intake': 'ml',
    'Gastric Meds': 'ml',
    'GT Flush': 'ml',
    'Potassium Chloride': 'mEq',
}

for cleaned_label, kept_unit in amount_unit_by_label.items():
    wrong_unit = (inputs_small_3['label'] == cleaned_label) & (inputs_small_3['amountuom'] != kept_unit)
    inputs_small_3 = inputs_small_3.drop(inputs_small_3.loc[wrong_unit].index).copy()

In [10]:
# Verify the cleaning: print the amount-unit counts per label again, untruncated.
amount_unit_counts = inputs_small_3.groupby('label')['amountuom'].value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(amount_unit_counts)

label                      amountuom
Albumin 5%                 ml            24231
Calcium Gluconate          grams         56516
D5 1/2NS                   ml            25270
Dextrose 5%                ml           942824
Furosemide (Lasix)         mg            92557
GT Flush                   ml           195686
Gastric Meds               ml           264344
Heparin Sodium             units         67938
Insulin - Glargine         units         27554
Insulin - Humalog          units         77685
Insulin - Regular          units        261278
K Phos                     mmol           8275
KCL (Bolus)                ml            92846
LR                         ml            99889
Lorazepam (Ativan)         mg            33846
Magnesium Sulfate          grams         65760
Magnesium Sulfate (Bolus)  ml            47096
Metoprolol                 mg            54874
Midazolam (Versed)         mg            95589
Morphine Sulfate           mg            68543
Nitroglycerin          

In [11]:
# Same inventory for the inputs given as rates: count every rate unit per label.
(
    inputs_small_3
    .groupby('label')['rateuom']
    .value_counts()
)

label                      rateuom      
Albumin 5%                 mL/hour           24231
D5 1/2NS                   mL/hour           25069
Dextrose 5%                mL/hour          574205
                           mL/min                2
                           mL/kg/hour            1
Furosemide (Lasix)         mg/hour           18084
Heparin Sodium             units/hour        60759
                           units/kg/hour         2
Insulin - Regular          units/hour       126305
KCL (Bolus)                mL/hour           92846
LR                         mL/hour           54277
Lorazepam (Ativan)         mg/hour             181
Magnesium Sulfate (Bolus)  mL/hour           47094
                           mL/min                2
Midazolam (Versed)         mg/hour           45185
Morphine Sulfate           mg/hour            6471
Nitroglycerin              mcg/kg/min        72773
Norepinephrine             mcg/kg/min       304480
                           mg/kg/min     

In [12]:
# Keep a single rate unit per label: for each label listed below, every row whose
# rateuom differs from the retained unit is discarded. Labels not listed are left
# untouched.
# NOTE(review): `!=` is also True when rateuom is missing, so for these labels the
# rows that have no rate unit at all are discarded too. This matches the previous
# one-statement-per-label code; confirm that it is intended.
rate_unit_by_label = {
    'Dextrose 5%': 'mL/hour',
    'Magnesium Sulfate (Bolus)': 'mL/hour',
    'Piggyback': 'mL/hour',
    'Packed Red Blood Cells': 'mL/hour',
    'Phenylephrine': 'mcg/kg/min',
    'Sterile Water': 'mL/hour',
    'Heparin Sodium': 'units/hour',
    'Norepinephrine': 'mcg/kg/min',
    'Solution': 'mL/hour',
}

for cleaned_label, kept_unit in rate_unit_by_label.items():
    wrong_unit = (inputs_small_3['label'] == cleaned_label) & (inputs_small_3['rateuom'] != kept_unit)
    inputs_small_3 = inputs_small_3.drop(inputs_small_3.loc[wrong_unit].index).copy()

# Check that a single rate unit is left per drug.
inputs_small_3.groupby('label')['rateuom'].value_counts()

label                      rateuom   
Albumin 5%                 mL/hour        24231
D5 1/2NS                   mL/hour        25069
Dextrose 5%                mL/hour       574205
Furosemide (Lasix)         mg/hour        18084
Heparin Sodium             units/hour     60759
Insulin - Regular          units/hour    126305
KCL (Bolus)                mL/hour        92846
LR                         mL/hour        54277
Lorazepam (Ativan)         mg/hour          181
Magnesium Sulfate (Bolus)  mL/hour        47094
Midazolam (Versed)         mg/hour        45185
Morphine Sulfate           mg/hour         6471
Nitroglycerin              mcg/kg/min     72773
Norepinephrine             mcg/kg/min    304480
Packed Red Blood Cells     mL/hour        47740
Phenylephrine              mcg/kg/min    177298
Piggyback                  mL/hour        85499
Solution                   mL/hour       430113
Sterile Water              mL/hour        57881
Name: rateuom, dtype: int64

We now split the entries that are spread over time.
We choose a duration window for the sampling; here it is 30 minutes. Every entry (with or without a rate) that lasts longer than this window is split into equally sized injections at fixed 30-minute intervals, starting at its start time.

In [13]:
# Sanity check before the entries are split in time: for every entry that has a
# rate, rate x duration (x patient weight for per-kg rates) must reproduce the
# recorded amount within a tolerance of 0.01.

def count_rate_amount_mismatches(events, unit_pattern, seconds_per_rate_unit,
                                 per_kg=False, computed_divisor=1, amount_factor=1,
                                 tolerance=0.01):
    """Count entries whose recorded amount disagrees with rate x duration.

    Parameters
    ----------
    events : DataFrame with the columns rate, rateuom, starttime, endtime,
        amount and (when ``per_kg`` is True) patientweight.
    unit_pattern : pattern searched for in ``rateuom`` with ``str.contains``;
        only rows with a non-null rate and a matching unit are checked.
    seconds_per_rate_unit : number of seconds in the time unit of the rate
        (3600 for ".../hour", 60 for ".../min").
    per_kg : multiply by ``patientweight`` (rates expressed per kg).
    computed_divisor : the computed amount is divided by this value before the
        comparison.
    amount_factor : the recorded amount is multiplied by this value before the
        comparison.
    tolerance : largest accepted absolute difference.

    Returns
    -------
    int : number of checked rows whose absolute difference exceeds ``tolerance``.
        Rows where the difference is NaN are not counted.
    """
    selected = events.loc[(events['rate'].notnull()) & (events['rateuom'].str.contains(unit_pattern))]
    elapsed = (pd.to_datetime(selected['endtime']) - pd.to_datetime(selected['starttime'])).dt.total_seconds() / seconds_per_rate_unit
    computed_amount = selected['rate'] * elapsed
    if per_kg:
        computed_amount = computed_amount * selected['patientweight']
    deviation = abs(computed_amount / computed_divisor - amount_factor * selected['amount'])
    return int((deviation > tolerance).sum())


# (unit pattern, seconds per rate time unit, per-kg rate, computed divisor, amount factor)
# The factor 1000 on the per-kg rows presumably converts between mcg (rate) and
# mg (amount) -- TODO confirm against the amountuom of the affected labels.
rate_amount_checks = [
    ('mcg/kg/hour', 3600, True, 1, 1000),
    ('mL/hour', 3600, False, 1, 1),
    ('mg/hour', 3600, False, 1, 1),
    ('mcg/hour', 3600, False, 1, 1),
    ('units/hour', 3600, False, 1, 1),
    ('mg/min', 60, False, 1, 1),
    ('mcg/kg/min', 60, True, 1000, 1),
]

for unit_pattern, seconds_per_rate_unit, per_kg, computed_divisor, amount_factor in rate_amount_checks:
    n_mismatches = count_rate_amount_mismatches(
        inputs_small_3,
        unit_pattern,
        seconds_per_rate_unit,
        per_kg=per_kg,
        computed_divisor=computed_divisor,
        amount_factor=amount_factor,
    )
    assert n_mismatches == 0, f"{n_mismatches} '{unit_pattern}' entries where amount != rate x duration"

In [14]:
# Length of one sampling bin, in hours and in seconds.
duration_split_hours=0.5
to_sec_fact=3600*duration_split_hours

# Parse the timestamps once and derive the masks shared by the four splits.
event_duration = pd.to_datetime(inputs_small_3['endtime']) - pd.to_datetime(inputs_small_3['starttime'])
split_threshold = timedelta(hours=duration_split_hours)
is_extended = event_duration > split_threshold
# Written as an explicit `<=` (not `~is_extended`) so that an entry with a missing
# duration lands in neither group and is caught by the completeness assert below.
is_short = event_duration <= split_threshold
has_rate = inputs_small_3['rate'].notnull()
has_no_rate = inputs_small_3['rate'].isnull()

# Split the data set in four.

# 1) no rate, extended duration (longer than duration_split_hours)
df_temp1 = inputs_small_3.loc[is_extended & has_no_rate].copy().reset_index(drop=True)
# 2) no rate, short duration (at most duration_split_hours)
df_temp2 = inputs_small_3.loc[is_short & has_no_rate].copy().reset_index(drop=True)
# 3) with a rate, extended duration (longer than duration_split_hours)
df_temp3 = inputs_small_3.loc[is_extended & has_rate].copy().reset_index(drop=True)
# 4) with a rate, short duration (at most duration_split_hours)
df_temp4 = inputs_small_3.loc[is_short & has_rate].copy().reset_index(drop=True)

# Check that the split is complete: every entry belongs to exactly one part.
assert len(df_temp1.index) + len(df_temp2.index) + len(df_temp3.index) + len(df_temp4.index) == len(inputs_small_3.index)

In [15]:
# Extended entries without a rate: each entry is replaced by one row per sampling
# bin, and its amount is shared equally between those rows.

# Number of bins covered by each entry (duration / bin length, rounded up).
df_temp1['Repeat'] = np.ceil(
    (pd.to_datetime(df_temp1['endtime']) - pd.to_datetime(df_temp1['starttime'])).dt.total_seconds() / to_sec_fact
).astype(int)

# Duplicate every row 'Repeat' times; the copies keep the index of their source
# row, which is what groups them together below.
df_new1 = df_temp1.reindex(df_temp1.index.repeat(df_temp1['Repeat']))

# Administration time of each copy: start time shifted by 0, 1, 2, ... bin lengths.
# cumcount() numbers the copies of one source row in order, so this yields the same
# timestamps as a per-entry pd.date_range, without a Python call per entry.
bin_number = df_new1.groupby(level=0).cumcount()
df_new1['charttime'] = pd.to_datetime(df_new1['starttime']) + bin_number * pd.Timedelta(hours=duration_split_hours)

# Divide each amount by the number of copies so that the total is preserved.
df_new1['amount'] = df_new1['amount'] / df_new1['Repeat']

In [16]:
# Extended entries with a rate: same treatment -- one row per sampling bin, with
# the amount shared equally between the rows.

# Number of bins covered by each entry (duration / bin length, rounded up).
df_temp3['Repeat'] = np.ceil(
    (pd.to_datetime(df_temp3['endtime']) - pd.to_datetime(df_temp3['starttime'])).dt.total_seconds() / to_sec_fact
).astype(int)

# Duplicate every row 'Repeat' times; the copies keep the index of their source
# row, which is what groups them together below.
df_new3 = df_temp3.reindex(df_temp3.index.repeat(df_temp3['Repeat']))

# Administration time of each copy: start time shifted by 0, 1, 2, ... bin lengths.
# cumcount() numbers the copies of one source row in order, so this yields the same
# timestamps as a per-entry pd.date_range, without a Python call per entry.
bin_number = df_new3.groupby(level=0).cumcount()
df_new3['charttime'] = pd.to_datetime(df_new3['starttime']) + bin_number * pd.Timedelta(hours=duration_split_hours)

# Divide each amount by the number of copies so that the total is preserved.
df_new3['amount'] = df_new3['amount'] / df_new3['Repeat']

In [17]:
# Short entries are not split: their administration time is simply the start time.
# It is parsed to a timestamp so that 'charttime' has the same type as in the
# split frames (df_new1 / df_new3), which hold timestamps rather than strings;
# otherwise the merged column would mix strings and timestamps.
df_temp2['charttime'] = pd.to_datetime(df_temp2['starttime'])
df_temp4['charttime'] = pd.to_datetime(df_temp4['starttime'])

In [18]:
# Finally, merge the four splits into one frame. pd.concat replaces the deprecated
# DataFrame.append (removed in pandas 2.0); the row order and the sorted column
# order are the same. Frames without a 'Repeat' column get NaN in that column.
inputs_small_4 = pd.concat([df_new1, df_temp2, df_new3, df_temp4], sort=True)

# The result is a dataset with discrete inputs for each treatment.
inputs_small_4.shape

  inputs_small_4=df_new1.append([df_temp2,df_new3,df_temp4], sort=True)


(17637683, 14)

In [19]:
# Write the discretised inputs to disk (the index is written as the first column).
processed_inputs_csv = path_data + '/processed/inputs_processed.csv'
inputs_small_4.to_csv(processed_inputs_csv)

# Number of distinct admissions present in the exported table.
inputs_small_4['hadm_id'].nunique()

52548