In [1]:
import os
import pathlib

p_project = str(pathlib.Path(os.getcwd()).parents[1])

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
from datetime import timedelta

import numpy as np

## INPUTS EVENTS DATA

We now consider the inputevents dataset. We select only the patients in the metavision system and with the same criteria as above.

In [3]:
inputs=pd.read_csv(p_project + "/data/pic/INPUTEVENTS.csv.gz", compression='gzip')

adm = pd.read_csv(p_project + "/temp/pic/processed/admissions_processed.csv")
#Restrict the dataset to the previously selected admission ids only.
adm_ids=list(adm["HADM_ID"])
inputs=inputs.loc[inputs["HADM_ID"].isin(adm_ids)]

In [None]:
# wait


In [None]:
#Inputs_small only contains the columns of interest.
inputs_small=inputs[["SUBJECT_ID","HADM_ID","STARTTIME","ENDTIME","ITEMID","AMOUNT","AMOUNTUOM","RATE","RATEUOM","PATIENTWEIGHT","ORDERCATEGORYDESCRIPTION"]]
print(inputs_small.head())

print("Number of patients remaining in the database: ")
print(inputs_small["SUBJECT_ID"].nunique())

In [4]:
#item_id 
item_id=pd.read_csv(p_project + "/data/pic/D_ITEMS.csv.gz", compression='gzip')
item_id_1=item_id[["ITEMID","LABEL"]]
item_id_1.head()

#We merge the name of the item administrated.
inputs_small_2=pd.merge(inputs_small,item_id_1,on="ITEMID")
inputs_small_2.head()
print("Number of patients remaining in the database: ")
print(inputs_small_2["SUBJECT_ID"].nunique())

Number of patients remaining in the database: 
10662


In [5]:
#For each item, evaluate the number of patients who have been given this item.
pat_for_item=inputs_small_2.groupby("LABEL")["SUBJECT_ID"].nunique()
#Order by occurence and take the 33 best (the ones with the most patients)
frequent_labels=pat_for_item.sort_values(ascending=False)[:50]

#Select only the time series with high occurence.
inputs_small_3=inputs_small_2.loc[inputs_small_2["LABEL"].isin(list(frequent_labels.index))].copy()

print("Number of patients remaining in the database: ")
print(inputs_small_3["SUBJECT_ID"].nunique())

Number of patients remaining in the database: 
10660


In [6]:
#Only select specific labels for the inputs.
#list of retained inputs :
retained_list=["Albumin 5%","Dextrose 5%","Lorazepam (Ativan)","Calcium Gluconate","Midazolam (Versed)","Phenylephrine","Furosemide (Lasix)","Hydralazine","Norepinephrine","Magnesium Sulfate","Nitroglycerin","Insulin - Glargine","Insulin - Humalog","Insulin - Regular","Heparin Sodium","Morphine Sulfate","Potassium Chloride","Packed Red Blood Cells","Gastric Meds","D5 1/2NS","LR","K Phos","Solution","Sterile Water","Metoprolol","Piggyback","OR Crystalloid Intake","OR Cell Saver Intake","PO Intake","GT Flush","KCL (Bolus)","Magnesium Sulfate (Bolus)"]
#missing :Fresh Frozen Plasma
inputs_small_3=inputs_small_3.loc[inputs_small_3["LABEL"].isin(retained_list)].copy()

In [7]:
inputs_small_3.shape

(981134, 12)

In [8]:
inputs_small_3.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,STARTTIME,ENDTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,RATEUOM,PATIENTWEIGHT,ORDERCATEGORYDESCRIPTION,LABEL
0,27063,139787,2133-02-05 06:29:00,2133-02-05 08:45:00,225166,6.774532,mEq,,,83.2,Continuous IV,Potassium Chloride
1,27063,139787,2133-02-05 05:34:00,2133-02-05 06:30:00,225166,2.8133,mEq,,,83.2,Continuous IV,Potassium Chloride
2,27063,139787,2133-02-05 05:34:00,2133-02-05 07:03:00,225166,4.433333,mEq,,,83.2,Continuous IV,Potassium Chloride
3,27063,139787,2133-02-05 09:43:00,2133-02-05 12:30:00,225166,8.360021,mEq,,,83.2,Continuous IV,Potassium Chloride
4,27063,139787,2133-02-05 05:34:00,2133-02-05 05:35:00,225166,0.05006,mEq,,,83.2,Continuous IV,Potassium Chloride


# Cleaning of the input data

### Units Cleaning

#### 1) Amounts

In [9]:
#Verification that all input labels have the same amounts units.
inputs_small_3.groupby("LABEL")["AMOUNTUOM"].value_counts()

LABEL                      AMOUNTUOM
Albumin 5%                 ml             4182
Calcium Gluconate          grams         12430
D5 1/2NS                   ml             8796
Dextrose 5%                ml           198060
Furosemide (Lasix)         mg            22982
GT Flush                   ml            26581
Gastric Meds               ml            39537
Heparin Sodium             units         15085
Hydralazine                mg             7660
Insulin - Glargine         units          4165
Insulin - Humalog          units         13139
Insulin - Regular          units         66267
K Phos                     mmol           3089
KCL (Bolus)                ml            21836
LR                         ml            30429
Lorazepam (Ativan)         mg             9771
Magnesium Sulfate          grams         13555
                           mg                1
Magnesium Sulfate (Bolus)  ml             9570
Metoprolol                 mg            16336
                       

In [10]:
##### Cleaning the Cefazolin (remove the ones that are not in dose unit)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["ITEMID"]==225850) & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Cefepime (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefepime") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Ceftriaxone (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Ceftriaxone") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Ciprofloxacin (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Ciprofloxacin") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Famotidine (Pepcid) (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Famotidine (Pepcid)") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Fentanyl (Concentrate) (remove the non mg)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNTUOM"]!="mg")].index).copy()
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNTUOM"]=="mg"),"AMOUNT"]*=1000
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNTUOM"]=="mg"),"AMOUNTUOM"]="mcg"
#Cleaning the Heparin Sodium (Prophylaxis) (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Heparin Sodium (Prophylaxis)") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Hydromorphone (Dilaudid) (remove the non mg)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Hydromorphone (Dilaudid)") & (inputs_small_3["AMOUNTUOM"]!="mg")].index).copy()
#Cleaning the Magnesium Sulfate (remove the non grams)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Magnesium Sulfate") & (inputs_small_3["AMOUNTUOM"]!="grams")].index).copy()
#Cleaning the Propofol (remove the non mg)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Propofol") & (inputs_small_3["AMOUNTUOM"]!="mg")].index).copy()
#Cleaning the Metoprolol (remove the non mg)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Metoprolol") & (inputs_small_3["AMOUNTUOM"]!="mg")].index).copy()
#Cleaning the Piperacillin/Tazobactam (Zosyn) (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Piperacillin/Tazobactam (Zosyn)") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Metronidazole (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Metronidazole") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Ranitidine (Prophylaxis)(remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Ranitidine (Prophylaxis)") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Vancomycin (remove the non dose)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Vancomycin") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()
#Cleaning the Fentanyl. Put the mg to mcg 
inputs_small_3.loc[(inputs_small_3["ITEMID"]==221744) & (inputs_small_3["AMOUNTUOM"]=="mg"),"AMOUNT"]*=1000
inputs_small_3.loc[(inputs_small_3["ITEMID"]==221744) & (inputs_small_3["AMOUNTUOM"]=="mg"),"AMOUNTUOM"]="mcg"
#Cleaning of the Pantoprazole (Protonix)
    #divide in two (drug shot or continuous treatment and create a new item id for the continuous version)
inputs_small_3.loc[(inputs_small_3["ITEMID"]==225910) & (inputs_small_3["ORDERCATEGORYDESCRIPTION"]=="Continuous Med"),"LABEL"]="Pantoprazole (Protonix) Continuous"
inputs_small_3.loc[(inputs_small_3["ITEMID"]==225910) & (inputs_small_3["ORDERCATEGORYDESCRIPTION"]=="Continuous Med"),"ITEMID"]=2217441
#remove the non dose from the drug shot version
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Pantoprazole (Protonix)") & (inputs_small_3["AMOUNTUOM"]!="dose")].index).copy()


#Verification that all input labels have the same units.
inputs_small_3.groupby("LABEL")["AMOUNTUOM"].value_counts()

LABEL                      AMOUNTUOM
Albumin 5%                 ml             4182
Calcium Gluconate          grams         12430
D5 1/2NS                   ml             8796
Dextrose 5%                ml           198060
Furosemide (Lasix)         mg            22982
GT Flush                   ml            26581
Gastric Meds               ml            39537
Heparin Sodium             units         15085
Hydralazine                mg             7660
Insulin - Glargine         units          4165
Insulin - Humalog          units         13139
Insulin - Regular          units         66267
K Phos                     mmol           3089
KCL (Bolus)                ml            21836
LR                         ml            30429
Lorazepam (Ativan)         mg             9771
Magnesium Sulfate          grams         13555
Magnesium Sulfate (Bolus)  ml             9570
Metoprolol                 mg            16336
Midazolam (Versed)         mg            34560
Morphine Sulfate       

#### 2) Rates

In [91]:
#inputs_small_3.groupby("LABEL")["RATEUOM"].value_counts()

In [11]:
#Cleaning of Dextrose 5%  (remove the non mL/hour)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Dextrose 5%") & (inputs_small_3["RATEUOM"]!="mL/hour")].index).copy()
#Cleaning of Magnesium Sulfate (Bolus)  (remove the non mL/hour)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Magnesium Sulfate (Bolus)") & (inputs_small_3["RATEUOM"]!="mL/hour")].index).copy()
#Cleaning of NaCl 0.9% (remove the non mL/hour)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="NaCl 0.9%") & (inputs_small_3["RATEUOM"]!="mL/hour")].index).copy()
#Cleaning of Piggyback (remove the non mL/hour)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Piggyback") & (inputs_small_3["RATEUOM"]!="mL/hour")].index).copy()
#Cleaning of Packed Red Bllod Cells (remove the non mL/hour)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Packed Red Blood Cells") & (inputs_small_3["RATEUOM"]!="mL/hour")].index).copy()

#Check if a single unit per drug
inputs_small_3.groupby("LABEL")["RATEUOM"].value_counts()

LABEL                      RATEUOM   
Albumin 5%                 mL/hour         4182
D5 1/2NS                   mL/hour         8770
Dextrose 5%                mL/hour       110219
Furosemide (Lasix)         mg/hour         9268
Heparin Sodium             units/hour     13915
Hydralazine                mg/hour            2
Insulin - Regular          units/hour     30809
KCL (Bolus)                mL/hour        21836
LR                         mL/hour        19250
Lorazepam (Ativan)         mg/hour          106
Magnesium Sulfate (Bolus)  mL/hour         9568
Midazolam (Versed)         mg/hour        19816
Morphine Sulfate           mg/hour         2119
Nitroglycerin              mcg/kg/min     22258
Norepinephrine             mcg/kg/min     36899
Packed Red Blood Cells     mL/hour        10853
Phenylephrine              mcg/kg/min     47977
Piggyback                  mL/hour        18481
Solution                   mL/hour       117605
Sterile Water              mL/hour        11799
Na

### Check for outliers

#### 1) In amounts

In [12]:
inputs_small_3.groupby("LABEL")["AMOUNT"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Albumin 5%,4182.0,387.179434,165.277513,-149.999994,249.99999,499.99998,499.99998,5199.999792
Calcium Gluconate,12430.0,2.398595,3.459555,-0.933333,2.0,2.0,2.0,199.999998
D5 1/2NS,8796.0,634.169208,438.431068,-918.173907,199.999992,774.867035,999.999968,5999.999904
Dextrose 5%,110219.0,115.602237,192.446352,-5250.0,6.550234,47.301135,152.000009,5000.00004
Furosemide (Lasix),22982.0,35.351467,43.844288,-120.000009,15.333333,20.000001,40.000003,2320.0002
GT Flush,26581.0,48.73466,63.42406,-3750.0,30.0,30.0,50.0,2000.0
Gastric Meds,39537.0,66.559448,1212.065824,-5400.0,30.0,40.0,60.0,240120.0
Heparin Sodium,15085.0,9486.883495,9717.438338,-525000.0,2138.730944,6800.0,15483.14061,30640.0003
Hydralazine,7660.0,11.845484,27.463239,-240.000012,10.000001,10.000001,10.000001,2350.0001
Insulin - Glargine,4165.0,23.37527,28.041081,-180.0,10.0,20.0,30.0,1515.0


In [13]:
#Clean Albumin 5%
#Invert start time and end time
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Albumin 5%") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Albumin 5%") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Albumin 5%") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Albumin 5%") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Albumin 5%") & (inputs_small_3["AMOUNT"]<0),"AMOUNT"]*=-1

#Clean Calcium Gluconate
#Invert start time and end time
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Calcium Gluconate") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Calcium Gluconate") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Calcium Gluconate") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Calcium Gluconate") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Calcium Gluconate") & (inputs_small_3["AMOUNT"]<0),"AMOUNT"]*=-1
#Remove entries with more 10 grams.
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Calcium Gluconate") & (inputs_small_3["AMOUNT"]>10)].index).copy()


#Clean Cefazolin
#Invert start time and end time
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefazolin") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefazolin") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefazolin") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefazolin") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefazolin") & (inputs_small_3["AMOUNT"]<0),"AMOUNT"]*=-1
#Remove entries with more than 2 doses amount.
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefazolin") & (inputs_small_3["AMOUNT"]>2)].index).copy()

#Clean Cefepime
#Remove the negative entries (they are anyway too large in the positive range as well.)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Cefepime") & (inputs_small_3["AMOUNT"]<0)].index).copy()

#Clean Ceftriaxone
#Remove the negative entries (they are anyway too large in the positive range as well.)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Ceftriaxone") & (inputs_small_3["AMOUNT"]<0)].index).copy()

#Clean Ciprofloxacin
#Remove the negative entries (they are anyway too large in the positive range as well.)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Ciprofloxacin") & (inputs_small_3["AMOUNT"]<0)].index).copy()

#Clean D5 1/2NS
#Invert start time and end time
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="D5 1/2NS") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]
inputs_small_3.loc[(inputs_small_3["LABEL"]=="D5 1/2NS") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]=inputs_small_3.loc[(inputs_small_3["LABEL"]=="D5 1/2NS") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="D5 1/2NS") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[(inputs_small_3["LABEL"]=="D5 1/2NS") & (inputs_small_3["AMOUNT"]<0),"AMOUNT"]*=-1

#Clean Dextrose 5%
#Invert start time and end time
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Dextrose 5%") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Dextrose 5%") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Dextrose 5%") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Dextrose 5%") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Dextrose 5%") & (inputs_small_3["AMOUNT"]<0),"AMOUNT"]*=-1

#Clean Famotidine (Pepcid)
#Remove the negative entries (they are anyway too large in the positive range as well.)
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Famotidine (Pepcid)") & (inputs_small_3["AMOUNT"]<0)].index).copy()
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Famotidine (Pepcid)") & (inputs_small_3["AMOUNT"]>1)].index).copy()

#Clean Fentanyl
#Invert start time and end time
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl") & (inputs_small_3["AMOUNT"]<0),"AMOUNT"]*=-1

#Clean Fentanyl (Concentrate)
#Invert start time and end time
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNT"]<0),"STARTTIME"]=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]
a=inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNT"]<0),"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[(inputs_small_3["LABEL"]=="Fentanyl (Concentrate)") & (inputs_small_3["AMOUNT"]<0),"AMOUNT"]*=-1


#Check if all remaining negative values are linked to the swapping the start and end times.
inputs_small_3['STARTTIME']=pd.to_datetime(inputs_small_3["STARTTIME"], format='%Y-%m-%d %H:%M:%S')
inputs_small_3['ENDTIME']=pd.to_datetime(inputs_small_3["ENDTIME"], format='%Y-%m-%d %H:%M:%S')
inputs_small_3["DURATION"]=inputs_small_3['ENDTIME']-inputs_small_3['STARTTIME']
print(inputs_small_3.loc[(inputs_small_3["AMOUNT"]<0)&(inputs_small_3["DURATION"]>timedelta(0))]) #All are inverted

#Revert all the remaining negative values to the positive range.
a=inputs_small_3.loc[inputs_small_3["AMOUNT"]<0,"STARTTIME"]
inputs_small_3.loc[inputs_small_3["AMOUNT"]<0,"STARTTIME"]=inputs_small_3.loc[inputs_small_3["AMOUNT"]<0,"ENDTIME"]
a=inputs_small_3.loc[inputs_small_3["AMOUNT"]<0,"ENDTIME"]=a
#Positive rate
inputs_small_3.loc[inputs_small_3["AMOUNT"]<0,"AMOUNT"]*=-1

#Recompute the durations with the correct time stamps for start and end.
inputs_small_3["DURATION"]=inputs_small_3['ENDTIME']-inputs_small_3['STARTTIME']

#Clean Gastric Meds, remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Gastric Meds") & (inputs_small_3["AMOUNT"]>5000)].index).copy()

#Clean Heparin Sodium, remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Heparin Sodium") & (inputs_small_3["AMOUNT"]>50000)].index).copy()

#Clean Heparin Sodium (Prophylaxis), remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Heparin Sodium (Prophylaxis)") & (inputs_small_3["AMOUNT"]>2)].index).copy()

#Clean Hydralazine, remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Hydralazine") & (inputs_small_3["AMOUNT"]>200)].index).copy()

#Clean Hydromorphone (Dilaudid), remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Hydromorphone (Dilaudid)") & (inputs_small_3["AMOUNT"]>500)].index).copy()

#Clean Insulin - Humalog, remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Insulin - Humalog") & (inputs_small_3["AMOUNT"]>100)].index).copy()

#Clean Insulin - Regular, remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Insulin - Regular") & (inputs_small_3["AMOUNT"]>1000)].index).copy()

#Clean Magnesium Sulfate, remove too large values
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="Magnesium Sulfate") & (inputs_small_3["AMOUNT"]>51)].index).copy()

#To be continued ...

Empty DataFrame
Columns: [SUBJECT_ID, HADM_ID, STARTTIME, ENDTIME, ITEMID, AMOUNT, AMOUNTUOM, RATE, RATEUOM, PATIENTWEIGHT, ORDERCATEGORYDESCRIPTION, LABEL, DURATION]
Index: []


#### 2) In rates

In [14]:
inputs_small_3.groupby("LABEL")["RATE"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Albumin 5%,4182.0,466.637548,251.462363,25.0,249.99999,499.99998,499.99998,4285.714
Calcium Gluconate,0.0,,,,,,,
D5 1/2NS,8770.0,107.185895,661.375483,1.006036,75.0,80.000004,100.334454,59598.0
Dextrose 5%,110219.0,44.081607,334.585362,0.0,9.505703,14.883734,30.061585,25375.0
Furosemide (Lasix),9268.0,9.17358,10.607397,0.05,4.951088,7.997161,12.041118,635.781
GT Flush,0.0,,,,,,,
Gastric Meds,0.0,,,,,,,
Heparin Sodium,13913.0,1973.890062,16970.158028,10.500053,900.36012,1200.73896,1650.46782,1500000.0
Hydralazine,2.0,45.000004,7.071069,40.000003,42.500004,45.000004,47.500004,50.0
Insulin - Glargine,0.0,,,,,,,


In [17]:
#Clean D5 1/2NS Remove too large rates
inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]=="D5 1/2NS") & (inputs_small_3["RATE"]>1000)].index).copy()

#Remove all entries whose rate is more than 4 std away from the mean.
rate_desc=inputs_small_3.groupby("LABEL")["RATE"].describe()
name_list=list(rate_desc.loc[rate_desc["count"]!=0].index)
for label in name_list:
    inputs_small_3=inputs_small_3.drop(inputs_small_3.loc[(inputs_small_3["LABEL"]==label)&(inputs_small_3["RATE"]>(rate_desc.loc[label,"mean"]+4*rate_desc.loc[label,"std"]))].index).copy()

In [18]:
inputs_small_3.groupby("LABEL")["RATE"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Albumin 5%,4172.0,462.792405,236.223481,25.0,249.99999,499.99998,499.99998,1250.00004
Calcium Gluconate,0.0,,,,,,,
D5 1/2NS,8610.0,88.041332,42.823404,1.006036,75.0,80.000004,100.135691,272.7273
Dextrose 5%,109016.0,28.319623,40.189935,0.0,9.448495,14.625854,29.999996,340.90914
Furosemide (Lasix),9234.0,8.871685,5.78749,0.05,4.904625,7.995871,12.024048,30.839868
GT Flush,0.0,,,,,,,
Gastric Meds,0.0,,,,,,,
Heparin Sodium,13840.0,1360.301191,706.933933,10.500053,900.36012,1200.45879,1650.141,6881.607
Hydralazine,2.0,45.000004,7.071069,40.000003,42.500004,45.000004,47.500004,50.000005
Insulin - Glargine,0.0,,,,,,,


## We now split the entries which are spread in time.

We chose the duration window for the sampling. here we choose 30 minutes. So every entry which has a rate and with duration larger than 1 hour, we split it into fixed times injections.



In [19]:
#First we check if when there is a duration, the amount is matching.

#First check the /hours units
df_temp=inputs_small_3.loc[(inputs_small_3["RATE"].notnull()) & (inputs_small_3["RATEUOM"].str.contains("hour"))].copy()
df_temp["COMPUTED_AMOUNT"]=df_temp["RATE"]*(df_temp["DURATION"].dt.total_seconds()/3600)

#Check with a 0.01 tolerance
assert(len(df_temp.loc[(abs(df_temp["COMPUTED_AMOUNT"]-df_temp["AMOUNT"])>0.01)].index)==0) #OK

#Second check the /min units
df_temp=inputs_small_3.loc[(inputs_small_3["RATE"].notnull()) & (inputs_small_3["RATEUOM"].str.contains("mL/min"))].copy()
df_temp["COMPUTED_AMOUNT"]=df_temp["RATE"]*(df_temp["DURATION"].dt.total_seconds()/60)

#Check with a 0.01 tolerance
assert(len(df_temp.loc[(abs(df_temp["COMPUTED_AMOUNT"]-df_temp["AMOUNT"])>0.01)].index)==0) #OK

#Third check the kg/min units
df_temp=inputs_small_3.loc[(inputs_small_3["RATE"].notnull()) & (inputs_small_3["RATEUOM"].str.contains("kg/min"))].copy()
df_temp["COMPUTED_AMOUNT"]=df_temp["RATE"]*(df_temp["DURATION"].dt.total_seconds()/60)*(df_temp["PATIENTWEIGHT"])

#Check with a 0.01 tolerance
assert(len(df_temp.loc[(abs(df_temp["COMPUTED_AMOUNT"]-1000*df_temp["AMOUNT"])>0.01)].index)==0) #OK

In [20]:
duration_split_hours=0.5
to_sec_fact=3600*duration_split_hours

#split data set in four.

#The first dataframe contains the entries with no rate but with extended duration inputs (over 0.5 hour)
df_temp1=inputs_small_3.loc[(inputs_small_3["DURATION"]>timedelta(hours=duration_split_hours)) & (inputs_small_3["RATE"].isnull())].copy().reset_index(drop=True)
#The second dataframe contains the entries with no rate and low duration entries (<0.5hour)
df_temp2=inputs_small_3.loc[(inputs_small_3["DURATION"]<=timedelta(hours=duration_split_hours)) & (inputs_small_3["RATE"].isnull())].copy().reset_index(drop=True)
#The third dataframe contains the entries with a rate and extended duration inputs (over 0.5 hour)
df_temp3=inputs_small_3.loc[(inputs_small_3["DURATION"]>timedelta(hours=duration_split_hours)) & (inputs_small_3["RATE"].notnull())].copy().reset_index(drop=True)
#The forth dataframe contains the entries with a rate and low duration entries (< 0.5 hour)
df_temp4=inputs_small_3.loc[(inputs_small_3["DURATION"]<=timedelta(hours=duration_split_hours)) & (inputs_small_3["RATE"].notnull())].copy().reset_index(drop=True)

#Check if split is complete
assert(len(df_temp1.index)+len(df_temp2.index)+len(df_temp3.index)+len(df_temp4.index)==len(inputs_small_3.index))



In [21]:
#We then process all of these dfs.
#In the first one, we need to duplicate the entries according to their duration and then divide each entry by the number of duplicates

#We duplicate the rows with the number bins for each injection
df_temp1["Repeat"]=np.ceil(df_temp1["DURATION"].dt.total_seconds()/to_sec_fact).astype(int)
df_new1=df_temp1.reindex(df_temp1.index.repeat(df_temp1["Repeat"]))
#We then create the admninistration time as a shifted version of the STARTTIME.
df_new1["CHARTTIME"]=df_new1.groupby(level=0)['STARTTIME'].transform(lambda x: pd.date_range(start=x.iat[0],freq=str(60*duration_split_hours)+'min',periods=len(x)))
#We divide each entry by the number of repeats
df_new1["AMOUNT"]=df_new1["AMOUNT"]/df_new1["Repeat"]


# In the third one, we do the same
#We duplicate the rows with the number bins for each injection
df_temp3["Repeat"]=np.ceil(df_temp3["DURATION"].dt.total_seconds()/to_sec_fact).astype(int)
df_new3=df_temp3.reindex(df_temp3.index.repeat(df_temp3["Repeat"]))
#We then create the admninistration time as a shifted version of the STARTTIME.
df_new3["CHARTTIME"]=df_new3.groupby(level=0)['STARTTIME'].transform(lambda x: pd.date_range(start=x.iat[0],freq=str(60*duration_split_hours)+'min',periods=len(x)))
#We divide each entry by the number of repeats
df_new3["AMOUNT"]=df_new3["AMOUNT"]/df_new3["Repeat"]

df_temp2["CHARTTIME"]=df_temp2["STARTTIME"]
df_temp4["CHARTTIME"]=df_temp4["STARTTIME"]

In [22]:
#Eventually, we merge all 4splits into one.
inputs_small_4=df_new1.append([df_temp2,df_new3,df_temp4])
#The result is a dataset with discrete inputs for each treatment.

In [23]:
inputs_small_4.groupby("LABEL")["AMOUNT"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Albumin 5%,8249.0,195.769313,96.89079,0.0,124.999995,241.699984,249.99999,500.0
Calcium Gluconate,27712.0,0.901951,0.301038,0.0,1.0,1.0,1.0,9.007998
D5 1/2NS,148584.0,36.8429,21.324177,0.0,24.999999,37.037037,49.914999,1000.0
Dextrose 5%,1321254.0,9.29763,13.557502,0.0,3.721316,5.0,9.006601,167.449629
Furosemide (Lasix),120596.0,6.726208,13.873005,0.0,1.960814,3.449903,7.122678,2320.0002
GT Flush,26581.0,49.655619,62.705619,0.0,30.0,30.0,50.0,3750.0
Gastric Meds,39534.0,61.71652,97.514384,0.0,30.0,40.0,60.0,4560.0
Heparin Sodium,244441.0,584.500497,406.945899,0.0,377.777768,522.292226,712.238737,30000.0
Hydralazine,7658.0,11.682738,5.434583,0.0,10.000001,10.000001,10.000001,160.000016
Insulin - Glargine,4165.0,23.518848,27.920739,0.0,10.0,20.0,30.0,1515.0


In [24]:
#Again, we remove all the observations that are more than 5std away from the mean.

amount_desc=inputs_small_4.groupby("LABEL")["AMOUNT"].describe()
name_list=list(amount_desc.loc[amount_desc["count"]!=0].index)
for label in name_list:
    inputs_small_4=inputs_small_4.drop(inputs_small_4.loc[(inputs_small_4["LABEL"]==label)&(inputs_small_4["AMOUNT"]>(amount_desc.loc[label,"mean"]+5*amount_desc.loc[label,"std"]))].index).copy()


In [25]:
inputs_small_4.groupby("LABEL")["AMOUNT"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Albumin 5%,8243.0,195.608018,96.612783,0.0,124.999995,237.499987,249.99999,500.0
Calcium Gluconate,26668.0,0.89863,0.296948,0.0,1.0,1.0,1.0,2.0
D5 1/2NS,148397.0,36.771174,19.995412,0.0,24.999999,37.037037,49.914999,130.000776
Dextrose 5%,1209023.0,8.576186,10.786395,0.0,3.692554,5.0,8.62069,77.065923
Furosemide (Lasix),118560.0,5.951939,7.740617,0.0,1.955556,3.439757,7.055556,60.000002
GT Flush,25747.0,47.882083,48.711006,0.0,30.0,30.0,50.0,360.0
Gastric Meds,38494.0,57.033717,51.108332,0.0,30.0,40.0,60.0,520.0
Heparin Sodium,243359.0,572.901668,323.725808,0.0,376.672167,521.485166,711.116176,2600.0
Hydralazine,7581.0,11.490754,4.396403,0.0,10.000001,10.000001,10.000001,30.000001
Insulin - Glargine,4161.0,23.105263,15.449545,0.0,10.0,20.0,30.0,150.0


In [27]:
inputs_small_4.to_csv(p_project + "/temp/pic/processed/inputs_processed.csv")