# MIMIC 4 data - dataset construction inputevents

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import numpy as np

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 300)

In [3]:
adm = pd.read_csv("/path/admissions_processed.csv")

In [4]:
outputs = pd.read_csv('/path/outputevents.csv.gz')
outputs.tail()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valueuom
4248823,19999068,21606769,31096823,2161-08-30 12:00:00,2161-08-30 12:21:00,226559,180.0,ml
4248824,19999068,21606769,31096823,2161-08-30 14:00:00,2161-08-30 13:49:00,226559,320.0,ml
4248825,19999068,21606769,31096823,2161-08-30 15:00:00,2161-08-30 14:49:00,226559,120.0,ml
4248826,19999068,21606769,31096823,2161-08-30 18:00:00,2161-08-30 17:50:00,226559,400.0,ml
4248827,19999068,21606769,31096823,2161-08-30 23:00:00,2161-08-30 23:35:00,226559,250.0,ml


In [5]:
#Some checks
# edit: tag not present here
#assert(len(outputs.loc[outputs["warning"].notnull()].index)==0) #No entry with iserror==TRUE

#Restrict the dataset to the previously selected admission ids only.
adm_ids=list(adm["hadm_id"])
outputs=outputs.loc[outputs["hadm_id"].isin(adm_ids)]

print("Number of patients remaining in the database: ")
print(outputs["subject_id"].nunique())

Number of patients remaining in the database: 
16428


In [6]:
#item_id 
item_id=pd.read_csv('/path/d_items.csv.gz')
item_id_1=item_id[["itemid","label"]]
item_id_1.head()

#We merge the name of the item administrated.
outputs_2=pd.merge(outputs,item_id_1,on="itemid")
outputs_2.head()
print("Number of patients remaining in the database: ")
print(outputs_2["subject_id"].nunique())

Number of patients remaining in the database: 
16428


In [7]:
n_best=15
#For each item, evaluate the number of patients who have been given this item.
pat_for_item=outputs_2.groupby("label")["subject_id"].nunique()
#Order by occurence and take the 20 best (the ones with the most patients)
frequent_labels=pat_for_item.sort_values(ascending=False)[:n_best]

#Select only the time series with high occurence.
outputs_3=outputs_2.loc[outputs_2["label"].isin(list(frequent_labels.index))].copy()

print("Number of patients remaining in the database: ")
print(outputs_3["subject_id"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_3.index))

print(frequent_labels)

Number of patients remaining in the database: 
16363
Number of datapoints remaining in the database: 
1090625
label
Foley                 14037
Void                   6261
OR Urine               5334
Chest Tube #1          4464
Oral Gastric           2868
Pre-Admission          2537
TF Residual            2333
OR EBL                 2185
Emesis                 1166
Nasogastric            1133
Stool                  1022
Jackson Pratt #1        922
TF Residual Output      759
Fecal Bag               711
Straight Cath           629
Name: subject_id, dtype: int64


In [8]:
outputs_label_list=['Foley', 'Void', 'OR Urine', 'Chest Tube', 'Oral Gastric', 'Pre-Admission', 'TF Residual', 'OR EBL', 'Emesis', 'Nasogastric', 'Stool', 'Jackson Pratt', 'TF Residual Output', 'Fecal Bag', 'Straight Cath']
#outputs_label_list=['Gastric Gastric Tube','Stool Out Stool','Urine Out Incontinent','Ultrafiltrate Ultrafiltrate','Foley', 'Void','Condom Cath','Fecal Bag','Ostomy (output)','Chest Tube #1','Chest Tube #2','Jackson Pratt #1','OR EBL','Pre-Admission','TF Residual']
outputs_bis=outputs_2.loc[outputs_2["label"].isin(outputs_label_list)].copy()

print("Number of patients remaining in the database: ")
print(outputs_bis["subject_id"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_bis.index))

outputs_3=outputs_bis.copy()

Number of patients remaining in the database: 
16358
Number of datapoints remaining in the database: 
974526


ODE-GRU-Bayes does data cleaning here.

In [9]:
#Verification that all input labels have the same amounts units.
outputs_3.groupby("label")["valueuom"].value_counts() #OK

label               valueuom
Emesis              ml            2040
Fecal Bag           ml            2891
Foley               ml          855690
Nasogastric         ml            5430
OR EBL              ml            2597
OR Urine            ml            6015
Oral Gastric        ml            7961
Pre-Admission       ml            3268
Stool               ml            2653
Straight Cath       ml            2036
TF Residual         ml           22594
TF Residual Output  ml            2337
Void                ml           59014
Name: valueuom, dtype: int64

In [10]:
outputs_3.to_csv("/path/outputs_processed.csv")