# Pre-processing of the outputevents dataset

In [1]:
import os
import pathlib

# Project root as a string: the directory two levels above the current working directory.
p_project = str(pathlib.Path.cwd().parents[1])

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Display limits applied when pandas renders frames in this notebook.
for option_name, option_value in (
    ("display.max_rows", 50),
    ("display.max_columns", 300),
):
    pd.set_option(option_name, option_value)

In [3]:
# Processed admissions table; its HADM_ID column is used below to filter the output events.
adm = pd.read_csv(f"{p_project}/data/mimic3/processed/admissions_processed.csv")

We now consider the OUTPUTEVENTS dataset. We keep only the records that belong to the admissions listed in the processed admissions file loaded above.

In [4]:
# Raw output events, read directly from the gzip-compressed CSV.
outputs = pd.read_csv(
    f"{p_project}/data/mimic3/OUTPUTEVENTS.csv.gz",
    compression="gzip",
)

In [5]:
# Sanity check: no row may carry a value in the ISERROR column.
assert not outputs["ISERROR"].notnull().any()

# Keep only the events whose admission id is present in the processed admissions table.
adm_ids = list(adm["HADM_ID"])
in_selected_admissions = outputs["HADM_ID"].isin(adm_ids)
outputs = outputs.loc[in_selected_admissions]

print(
    "Number of patients remaining in the database: ",
    outputs["SUBJECT_ID"].nunique(),
    sep="\n",
)

Number of patients remaining in the database: 
24540


We load the D_ITEMS table, which contains the label (name) associated with each ITEMID, and merge it with the output events.

In [6]:
# Item dictionary: only its ITEMID and LABEL columns are used below.
item_id = pd.read_csv(p_project + "/data/mimic3/D_ITEMS.csv.gz", compression="gzip")

# Attach the item label to every output event (inner join on ITEMID).
# validate="many_to_one" makes the merge fail loudly if D_ITEMS ever contains a
# duplicated ITEMID, which would otherwise silently duplicate output rows.
outputs_2 = pd.merge(
    outputs,
    item_id[["ITEMID", "LABEL"]],
    on="ITEMID",
    validate="many_to_one",
)

print("Number of patients remaining in the database: ")
print(outputs_2["SUBJECT_ID"].nunique())

Number of patients remaining in the database: 
24540


In [7]:
# Output labels retained for the rest of the pre-processing.
outputs_label_list = [
    "Gastric Gastric Tube",
    "Stool Out Stool",
    "Urine Out Incontinent",
    "Ultrafiltrate Ultrafiltrate",
    "Foley",
    "Void",
    "Condom Cath",
    "Fecal Bag",
    "Ostomy (output)",
    "Chest Tube #1",
    "Chest Tube #2",
    "Jackson Pratt #1",
    "OR EBL",
    "Pre-Admission",
    "TF Residual",
]

# Keep only the events whose label is in the list above.
has_selected_label = outputs_2["LABEL"].isin(outputs_label_list)
outputs_bis = outputs_2.loc[has_selected_label].copy()

print("Number of patients remaining in the database: ")
print(outputs_bis["SUBJECT_ID"].nunique())
print("Number of datapoints remaining in the database: ")
print(outputs_bis.shape[0])

# Independent working copy that the cleaning cells below modify.
outputs_3 = outputs_bis.copy()

Number of patients remaining in the database: 
15964
Number of datapoints remaining in the database: 
781675


# Cleaning of the output data

### Units Cleaning

#### 1) Amounts

In [8]:
# Check that every output label is recorded with a single amount unit.
# In the recorded output each label has exactly one unit; the spelling only
# differs in case between labels ("mL" vs "ml").
(
    outputs_3
    .groupby("LABEL")["VALUEUOM"]
    .value_counts()
)

LABEL                        VALUEUOM
Chest Tube #1                mL           59614
Chest Tube #2                mL            7647
Condom Cath                  mL            2439
Fecal Bag                    mL            1278
Foley                        mL          603016
Gastric Gastric Tube         ml            1665
Jackson Pratt #1             mL            8487
OR EBL                       mL            1992
Ostomy (output)              mL            1579
Pre-Admission                mL            2759
Stool Out Stool              ml           30987
TF Residual                  mL           12181
Ultrafiltrate Ultrafiltrate  ml           18850
Urine Out Incontinent        ml             976
Void                         mL           27689
Name: VALUEUOM, dtype: int64

### Check for outliers

#### 1) In amounts

In [13]:
# Per-label summary statistics of VALUE, used to look for outliers.
# NOTE(review): this cell's stored execution count (13) is higher than that of
# the cleaning cells that follow it, so the saved table was presumably produced
# after cleaning - re-run the notebook in order to confirm.
(
    outputs_3
    .groupby("LABEL")["VALUE"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Chest Tube #1,59250.0,34.549377,35.458291,0.0,10.0,30.0,50.0,275.0
Chest Tube #2,7597.0,28.456628,36.330744,0.0,0.0,20.0,40.0,285.0
Condom Cath,2423.0,236.160957,201.437466,0.0,100.0,200.0,320.0,1100.0
Fecal Bag,1275.0,441.047843,329.053094,0.0,200.0,350.0,662.5,1600.0
Foley,603012.0,117.13908,117.433634,0.0,45.0,80.0,150.0,5400.0
Gastric Gastric Tube,1709.0,107.950848,128.437188,0.0,10.0,60.0,150.0,700.0
Jackson Pratt #1,8384.0,67.468356,75.721128,0.0,20.0,40.0,85.0,475.0
OR EBL,1962.0,543.196738,819.538233,0.0,100.0,250.0,600.0,5000.0
Ostomy (output),1567.0,196.530951,147.667173,0.0,100.0,175.0,275.0,850.0
Pre-Admission,2732.0,572.232796,601.981761,0.0,140.0,400.0,800.0,3600.0


In [10]:
# Remove all entries whose VALUE is more than 4 standard deviations above the
# mean of their label (only the upper tail is trimmed).
out_desc = outputs_3.groupby("LABEL")["VALUE"].describe()

# Per-label upper bound, looked up for every row in a single pass instead of
# dropping and copying the whole frame once per label. Labels whose mean or std
# is NaN (no value, or a single value) get a NaN bound; the comparison below is
# then False, so their rows are left untouched.
upper_bound = out_desc["mean"] + 4 * out_desc["std"]
row_upper_bound = outputs_3["LABEL"].map(upper_bound)
is_outlier = outputs_3["VALUE"] > row_upper_bound

# Boolean masking (rather than dropping by index label) does not depend on the
# index being unique.
outputs_3 = outputs_3.loc[~is_outlier].copy()

print("Number of patients remaining in the database: ")
print(outputs_3["SUBJECT_ID"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_3.index))

Number of patients remaining in the database: 
15934
Number of datapoints remaining in the database: 
780550


In [11]:
# Manual cleaning rules, expressed as data instead of one drop/copy pass per rule.

# Rows whose VALUE is strictly above the bound of their label are removed.
# Labels that are absent from the frame simply match no row.
value_upper_bounds = {
    "Foley": 5500,
    "OR EBL": 5000,  # expected blood loss
    "OR Out EBL": 5000,
    "OR Urine": 5000,
    "Pre-Admission": 5000,
    "Pre-Admission Output Pre-Admission Output": 5000,
    "Urine Out Foley": 5000,
}
# Rows with a negative VALUE are removed for these labels.
no_negative_labels = ["Pre-Admission", "Void"]

# Labels without a bound map to NaN, and any comparison against NaN is False,
# so those rows are never flagged as too large.
row_upper_bound = outputs_3["LABEL"].map(value_upper_bounds)
too_large = outputs_3["VALUE"] > row_upper_bound
negative = outputs_3["LABEL"].isin(no_negative_labels) & (outputs_3["VALUE"] < 0)

# Drop the flagged rows, then every row without a recorded VALUE.
outputs_3 = outputs_3.loc[~(too_large | negative)].dropna(subset=["VALUE"])

print("Number of patients remaining in the database: ")
print(outputs_3["SUBJECT_ID"].nunique())
print("Number of datapoints remaining in the database: ")
print(len(outputs_3.index))

Number of patients remaining in the database: 
12987
Number of datapoints remaining in the database: 
756217


As the data is already in timestamp format, we don't need to consider rates.

In [14]:
# Persist the cleaned output events. The frame's index is written as the first
# (unnamed) column, since to_csv keeps the index by default.
outputs_3.to_csv(f"{p_project}/data/mimic3/processed/outputs_processed.csv")