# MIMIC 4 data - dataset construction inputevents

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import numpy as np

In [2]:
# Load the processed admissions table; its hadm_id column is used to filter the input events.
admissions_path = "/path/admissions_processed.csv"
adm_3 = pd.read_csv(admissions_path)

In [3]:
# Read the raw input events table.
inputs = pd.read_csv('/path/inputevents.csv.gz')

# Keep only the events whose admission id appears in the processed admissions table.
adm_ids = list(adm_3["hadm_id"])
in_selected_admissions = inputs["hadm_id"].isin(adm_ids)
inputs = inputs.loc[in_selected_admissions]

# Show the last rows of the filtered table.
inputs.tail()

Unnamed: 0,subject_id,hadm_id,stay_id,starttime,endtime,storetime,itemid,amount,amountuom,rate,...,ordercategorydescription,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,cancelreason,statusdescription,originalamount,originalrate
8868265,18648460,23908012,35411347,2110-03-31 10:49:00,2110-03-31 10:50:00,2110-03-31 10:50:00,229616,4.0,mg,,...,Drug Push,58.1,,,0,0,0,FinishedRunning,4.0,4.0
8868266,18648460,23908012,35411347,2110-04-02 22:15:00,2110-04-02 22:16:00,2110-04-02 22:15:00,223260,16.0,units,,...,Drug Push,58.1,,,0,0,0,FinishedRunning,16.0,16.0
8868267,18648460,23908012,35411347,2110-04-02 22:31:00,2110-04-02 23:01:00,2110-04-02 22:31:00,222315,1.2,units,2.4,...,Continuous Med,58.1,100.0,ml,0,0,0,Stopped,40.0,2.4
8868268,18648460,23908012,35411347,2110-04-02 22:31:00,2110-04-02 23:01:00,2110-04-02 22:31:00,220949,3.0,ml,6.0,...,Continuous Med,58.1,100.0,ml,0,0,0,Stopped,100.0,6.0
8868269,18648460,23908012,35411347,2110-04-02 22:36:00,2110-04-02 22:37:00,2110-04-02 22:37:00,225158,500.0,ml,,...,Bolus,58.1,500.0,ml,0,0,0,FinishedRunning,500.0,500.0


In [4]:
# inputs_small keeps only the columns that the rest of the notebook works with.
columns_of_interest = [
    "subject_id",
    "hadm_id",
    "starttime",
    "endtime",
    "itemid",
    "amount",
    "amountuom",
    "rate",
    "rateuom",
    "patientweight",
    "ordercategorydescription",
]
inputs_small = inputs[columns_of_interest]
print(inputs_small.head())

# Report how many distinct patients are still present.
n_patients = inputs_small["subject_id"].nunique()
print("Number of patients remaining in the database: ")
print(n_patients)

     subject_id   hadm_id            starttime              endtime  itemid  \
295    13859862  25015072  2152-04-10 12:40:00  2152-04-10 12:41:00  221385   
359    13859862  25015072  2152-04-07 08:00:00  2152-04-07 08:01:00  225975   
360    13859862  25015072  2152-04-09 12:17:00  2152-04-09 12:18:00  225884   
361    13859862  25015072  2152-04-09 12:17:00  2152-04-09 12:18:00  220949   
362    13859862  25015072  2152-04-08 22:21:00  2152-04-09 03:34:00  225828   

         amount amountuom  rate  rateuom  patientweight  \
295    1.000000        mg   NaN      NaN           58.0   
359    1.000000      dose   NaN      NaN           58.0   
360    1.000000      dose   NaN      NaN           58.0   
361  100.000000        ml   NaN      NaN           58.0   
362   52.166668        ml  10.0  mL/hour           58.0   

    ordercategorydescription  
295                Drug Push  
359                Drug Push  
360                Drug Push  
361                Drug Push  
362            

In [5]:
# Load the item dictionary and keep only the itemid -> label mapping.
item_id = pd.read_csv('/path/d_items.csv.gz')
item_id_1 = item_id[["itemid", "label"]]

# Attach the label of the administered item to every input event (inner join on itemid).
# validate="many_to_one" makes the merge fail loudly if an itemid occurs more than once
# in the dictionary, which would otherwise silently duplicate input events.
inputs_small_2 = pd.merge(inputs_small, item_id_1, on="itemid", validate="many_to_one")

# Report how many distinct patients are still present after the merge.
print("Number of patients remaining in the database: ")
print(inputs_small_2["subject_id"].nunique())

Number of patients remaining in the database: 
16550


In [6]:
# Number of most frequently administered items that are kept.
N_FREQUENT_LABELS = 50

# For each item label, count the distinct patients who received that item.
pat_for_item = inputs_small_2.groupby("label")["subject_id"].nunique()
# Order by that count and keep the N_FREQUENT_LABELS labels given to the most patients.
frequent_labels = pat_for_item.sort_values(ascending=False)[:N_FREQUENT_LABELS]

# Keep only the events of those frequently administered items.
inputs_small_3 = inputs_small_2.loc[inputs_small_2["label"].isin(list(frequent_labels.index))].copy()

# Report how many distinct patients are still present.
print("Number of patients remaining in the database: ")
print(inputs_small_3["subject_id"].nunique())

Number of patients remaining in the database: 
16548


In [7]:
# Check whether every label uses a single amount unit:
# count the events per (label, amount unit) pair and print the listing without truncation.
amount_unit_counts = inputs_small_3.groupby("label")["amountuom"].value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(amount_unit_counts)

label                            amountuom
Acetaminophen-IV                 mg            21016
                                 grams            16
                                 dose              1
Albumin 5%                       ml             8270
Calcium Gluconate                grams         19072
Cefazolin                        dose          14850
                                 grams           374
                                 mg               14
Cefepime                         dose          13931
                                 grams           474
                                 mg                4
Ceftriaxone                      dose           4573
                                 grams            96
                                 mg                2
D5 1/2NS                         ml             6685
                                 L                 3
Dexmedetomidine (Precedex)       mcg           28011
                                 mg             1344
Dex

In [8]:
def _drop_rows(frame, unwanted):
    """Return a copy of `frame` without the rows selected by the boolean mask `unwanted`."""
    return frame.drop(frame.loc[unwanted].index).copy()


# Cefazolin (selected through its item id): keep only the entries recorded in "dose".
inputs_small_3 = _drop_rows(
    inputs_small_3,
    (inputs_small_3["itemid"] == 225850) & (inputs_small_3["amountuom"] != "dose"),
)

# Items that are kept only when recorded in "dose".
for drug in ["Cefepime", "Ceftriaxone", "Ciprofloxacin", "Famotidine (Pepcid)"]:
    inputs_small_3 = _drop_rows(
        inputs_small_3,
        (inputs_small_3["label"] == drug) & (inputs_small_3["amountuom"] != "dose"),
    )

# Fentanyl (Concentrate): keep only the mg entries, then express them in mcg.
inputs_small_3 = _drop_rows(
    inputs_small_3,
    (inputs_small_3["label"] == "Fentanyl (Concentrate)") & (inputs_small_3["amountuom"] != "mg"),
)
concentrate_in_mg = (inputs_small_3["label"] == "Fentanyl (Concentrate)") & (inputs_small_3["amountuom"] == "mg")
inputs_small_3.loc[concentrate_in_mg, "amount"] *= 1000
inputs_small_3.loc[concentrate_in_mg, "amountuom"] = "mcg"

# Remaining items with a single accepted amount unit: (label, unit to keep).
for drug, kept_unit in [
    ("Heparin Sodium (Prophylaxis)", "dose"),
    ("Hydromorphone (Dilaudid)", "mg"),
    ("Magnesium Sulfate", "grams"),
    ("Propofol", "mg"),
    ("Metoprolol", "mg"),
    ("Piperacillin/Tazobactam (Zosyn)", "dose"),
    ("Metronidazole", "dose"),
    ("Ranitidine (Prophylaxis)", "dose"),
    ("Vancomycin", "dose"),
]:
    inputs_small_3 = _drop_rows(
        inputs_small_3,
        (inputs_small_3["label"] == drug) & (inputs_small_3["amountuom"] != kept_unit),
    )

# Fentanyl (item id 221744): convert the mg entries to mcg.
fentanyl_in_mg = (inputs_small_3["itemid"] == 221744) & (inputs_small_3["amountuom"] == "mg")
inputs_small_3.loc[fentanyl_in_mg, "amount"] *= 1000
inputs_small_3.loc[fentanyl_in_mg, "amountuom"] = "mcg"

# Pantoprazole (Protonix): separate the continuous administrations from the drug shots
# by giving the continuous ones their own label and a new item id.
continuous_pantoprazole = (inputs_small_3["itemid"] == 225910) & (inputs_small_3["ordercategorydescription"] == "Continuous Med")
inputs_small_3.loc[continuous_pantoprazole, "label"] = "Pantoprazole (Protonix) Continuous"
inputs_small_3.loc[continuous_pantoprazole, "itemid"] = 2217441
# For the drug-shot version, keep only the entries recorded in "dose".
inputs_small_3 = _drop_rows(
    inputs_small_3,
    (inputs_small_3["label"] == "Pantoprazole (Protonix)") & (inputs_small_3["amountuom"] != "dose"),
)

In [9]:
# Amount-unit cleaning added for MIMIC-IV.

# Labels for which only one amount unit is kept: (label, unit to keep).
single_unit_labels = [
    ("Acetaminophen-IV", "mg"),
    ("D5 1/2NS", "ml"),
    ("LR", "ml"),
    ("NaCl 0.9%", "ml"),
    ("OR Crystalloid Intake", "ml"),
    ("PO Intake", "ml"),
    ("Pre-Admission/Non-ICU Intake", "ml"),
]
for item_label, kept_unit in single_unit_labels:
    wrong_unit = (inputs_small_3["label"] == item_label) & (inputs_small_3["amountuom"] != kept_unit)
    inputs_small_3 = inputs_small_3.drop(inputs_small_3.loc[wrong_unit].index).copy()

# Dexmedetomidine (Precedex): express every mcg entry in mg.
precedex_in_mcg = (inputs_small_3["label"] == "Dexmedetomidine (Precedex)") & (inputs_small_3["amountuom"] == "mcg")
inputs_small_3.loc[precedex_in_mcg, "amount"] /= 1000
inputs_small_3.loc[precedex_in_mcg, "amountuom"] = "mg"



In [10]:
# Print the (label, amount unit) counts again, without truncation, to inspect the units after cleaning.
amount_unit_counts = inputs_small_3.groupby("label")["amountuom"].value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(amount_unit_counts)

label                               amountuom
Acetaminophen-IV                    mg            21016
Albumin 5%                          ml             8270
Calcium Gluconate                   grams         19072
Cefazolin                           dose          14850
Cefepime                            dose          13931
Ceftriaxone                         dose           4573
D5 1/2NS                            ml             6685
Dexmedetomidine (Precedex)          mg            29355
Dextrose 5%                         ml           282270
Famotidine (Pepcid)                 dose          17987
Fentanyl                            mcg           60182
Fentanyl (Concentrate)              mcg           31875
Free Water                          ml            14587
Furosemide (Lasix)                  mg            27427
GT Flush                            ml            50947
Gastric Meds                        ml            67944
Heparin Sodium                      units         18377
He

In [11]:
# Count the events per (label, rate unit) pair to see which labels mix several rate units.
rate_units_by_label = inputs_small_3.groupby("label")["rateuom"]
rate_units_by_label.value_counts()

label                               rateuom    
Acetaminophen-IV                    mg/min           7680
                                    mg/hour             1
Albumin 5%                          mL/hour          8270
D5 1/2NS                            mL/hour          6649
Dexmedetomidine (Precedex)          mcg/kg/hour     29355
Dextrose 5%                         mL/hour        177145
Fentanyl                            mcg/hour         3186
Fentanyl (Concentrate)              mcg/hour        31874
                                    mcg/kg/hour         1
Furosemide (Lasix)                  mg/hour          4938
Heparin Sodium                      units/hour      16426
Hydralazine                         mg/hour             1
Hydromorphone (Dilaudid)            mg/hour          2021
Insulin - Regular                   units/hour      43150
KCL (Bolus)                         mL/hour         31499
LR                                  mL/hour         17502
Lorazepam (Ativan)      

In [12]:
# For each label below, keep only the entries recorded with the listed rate unit.
# Note that `!=` is also true for a missing rate unit, so entries of these labels that
# have no rate unit are removed as well.
kept_rate_units = [
    # MIMIC-III rules
    ("Dextrose 5%", "mL/hour"),
    ("Magnesium Sulfate (Bolus)", "mL/hour"),
    ("NaCl 0.9%", "mL/hour"),
    ("Piggyback", "mL/hour"),
    ("Packed Red Blood Cells", "mL/hour"),
    # additional rules for MIMIC-IV
    ("Acetaminophen-IV", "mg/min"),
    ("Fentanyl (Concentrate)", "mcg/hour"),
    ("Phenylephrine", "mcg/kg/min"),
    ("Sterile Water", "mL/hour"),
]
for item_label, kept_unit in kept_rate_units:
    wrong_rate_unit = (inputs_small_3["label"] == item_label) & (inputs_small_3["rateuom"] != kept_unit)
    inputs_small_3 = inputs_small_3.drop(inputs_small_3.loc[wrong_rate_unit].index).copy()

# Display the remaining (label, rate unit) counts to check for a single unit per drug.
rate_units_by_label = inputs_small_3.groupby("label")["rateuom"]
rate_units_by_label.value_counts()

label                               rateuom    
Acetaminophen-IV                    mg/min           7680
Albumin 5%                          mL/hour          8270
D5 1/2NS                            mL/hour          6649
Dexmedetomidine (Precedex)          mcg/kg/hour     29355
Dextrose 5%                         mL/hour        177145
Fentanyl                            mcg/hour         3186
Fentanyl (Concentrate)              mcg/hour        31874
Furosemide (Lasix)                  mg/hour          4938
Heparin Sodium                      units/hour      16426
Hydralazine                         mg/hour             1
Hydromorphone (Dilaudid)            mg/hour          2021
Insulin - Regular                   units/hour      43150
KCL (Bolus)                         mL/hour         31499
LR                                  mL/hour         17502
Lorazepam (Ativan)                  mg/hour            34
Magnesium Sulfate (Bolus)           mL/hour         14053
Midazolam (Versed)      

For MIMIC-III, the data were cleaned at this point. However, the basis on which certain values were removed is not documented, so we leave this step open until we have first experimental results on the data.

We now split the entries which are spread in time.
We choose the duration window for the sampling; here we use 30 minutes. Every entry that lasts longer than this window (with or without a recorded rate) is split into injections at fixed 30-minute intervals.

In [13]:
# Consistency check for the per-hour rates: rate * elapsed hours (times the patient weight
# for per-kg rates) must match the recorded amount within a 0.01 tolerance.
# Each entry: (rate unit, rate is per kg of patient weight, factor applied to the amount).
# For mcg/kg/hour the amount is multiplied by 1000 before it is compared.
hourly_checks = [
    ("mcg/kg/hour", True, 1000),
    ("mL/hour", False, 1),
    ("mg/hour", False, 1),
    ("mcg/hour", False, 1),
    ("units/hour", False, 1),
]
for rate_unit, per_kg, amount_factor in hourly_checks:
    has_this_rate = inputs_small_3["rate"].notnull() & inputs_small_3["rateuom"].str.contains(rate_unit)
    df_temp = inputs_small_3.loc[has_this_rate].copy()

    elapsed_hours = (pd.to_datetime(df_temp["endtime"]) - pd.to_datetime(df_temp["starttime"])).dt.total_seconds() / 3600
    df_temp["computed_amount"] = df_temp["rate"] * elapsed_hours
    if per_kg:
        df_temp["computed_amount"] = df_temp["computed_amount"] * df_temp["patientweight"]

    deviation = abs(df_temp["computed_amount"] - amount_factor * df_temp["amount"])
    assert len(df_temp.loc[deviation > 0.01].index) == 0

In [14]:
# Second check, the mg/min rates: rate * elapsed minutes must match the recorded amount.
rate_in_mg_min = inputs_small_3["rate"].notnull() & inputs_small_3["rateuom"].str.contains("mg/min")
df_temp = inputs_small_3.loc[rate_in_mg_min].copy()
elapsed_minutes = (pd.to_datetime(df_temp["endtime"]) - pd.to_datetime(df_temp["starttime"])).dt.total_seconds() / 60
df_temp["computed_amount"] = df_temp["rate"] * elapsed_minutes

# 0.01 tolerance
deviation = abs(df_temp["computed_amount"] - df_temp["amount"])
assert len(df_temp.loc[deviation > 0.01].index) == 0

# Third check, the mcg/kg/min rates: rate * elapsed minutes * patient weight,
# divided by 1000, must match the recorded amount.
rate_in_mcg_kg_min = inputs_small_3["rate"].notnull() & inputs_small_3["rateuom"].str.contains("mcg/kg/min")
df_temp = inputs_small_3.loc[rate_in_mcg_kg_min].copy()
elapsed_minutes = (pd.to_datetime(df_temp["endtime"]) - pd.to_datetime(df_temp["starttime"])).dt.total_seconds() / 60
df_temp["computed_amount"] = df_temp["rate"] * elapsed_minutes * df_temp["patientweight"]

# 0.01 tolerance
deviation = abs(df_temp["computed_amount"] / 1000 - df_temp["amount"])
assert len(df_temp.loc[deviation > 0.01].index) == 0


In [3]:
# Length of the sampling window, in hours and in seconds.
duration_split_hours = 0.5
to_sec_fact = 3600 * duration_split_hours

# Split the data set in four, by duration (longer than the window or not)
# and by whether a rate is recorded.
event_duration = pd.to_datetime(inputs_small_3["endtime"]) - pd.to_datetime(inputs_small_3["starttime"])
window = timedelta(hours=duration_split_hours)
is_extended = event_duration > window
is_brief = event_duration <= window
rate_missing = inputs_small_3["rate"].isnull()
rate_present = inputs_small_3["rate"].notnull()

# 1: no rate, duration over the window.
df_temp1 = inputs_small_3.loc[is_extended & rate_missing].copy().reset_index(drop=True)
# 2: no rate, duration within the window.
df_temp2 = inputs_small_3.loc[is_brief & rate_missing].copy().reset_index(drop=True)
# 3: with a rate, duration over the window.
df_temp3 = inputs_small_3.loc[is_extended & rate_present].copy().reset_index(drop=True)
# 4: with a rate, duration within the window.
df_temp4 = inputs_small_3.loc[is_brief & rate_present].copy().reset_index(drop=True)

# The four parts must cover every row exactly once.
n_split_rows = len(df_temp1.index) + len(df_temp2.index) + len(df_temp3.index) + len(df_temp4.index)
assert n_split_rows == len(inputs_small_3.index)

In [4]:
# Each of the four parts is now processed.
# First part: every entry is duplicated once per sampling window it spans; the amount is
# divided by the number of duplicates in a later cell.

# Number of windows covered by each entry (rounded up).
seconds_part1 = (pd.to_datetime(df_temp1["endtime"]) - pd.to_datetime(df_temp1["starttime"])).dt.total_seconds()
df_temp1["Repeat"] = np.ceil(seconds_part1 / to_sec_fact).astype(int)

# Repeat every row "Repeat" times; the copies keep the index label of the original row.
repeated_index_part1 = df_temp1.index.repeat(df_temp1["Repeat"])
df_new1 = df_temp1.reindex(repeated_index_part1)

In [5]:
# Create the administration time as a shifted version of starttime: the k-th copy of an
# entry (k = 0, 1, ...) is charted k sampling windows after the start of that entry.
# The copies of one entry share an index label, so cumcount() over the index numbers them.
# Plain arrays are used so that the duplicated index labels never take part in alignment.
window_number = df_new1.groupby(level=0).cumcount().values
window_offset = pd.to_timedelta(window_number * (60 * duration_split_hours), unit="min").values
df_new1["charttime"] = pd.to_datetime(df_new1["starttime"]).values + window_offset
# Divide each entry by its number of repeats so the copies sum to the original amount.
df_new1["amount"] = df_new1["amount"] / df_new1["Repeat"]

In [6]:
# Third part: same treatment as the first one.
# Number of sampling windows covered by each entry (rounded up).
seconds_part3 = (pd.to_datetime(df_temp3["endtime"]) - pd.to_datetime(df_temp3["starttime"])).dt.total_seconds()
df_temp3["Repeat"] = np.ceil(seconds_part3 / to_sec_fact).astype(int)

# Repeat every row "Repeat" times; the copies keep the index label of the original row.
repeated_index_part3 = df_temp3.index.repeat(df_temp3["Repeat"])
df_new3 = df_temp3.reindex(repeated_index_part3)
# The administration times are then created as shifted versions of starttime.

In [7]:
# The k-th copy of an entry (k = 0, 1, ...) is charted k sampling windows after its starttime.
# The copies of one entry share an index label, so cumcount() over the index numbers them.
# Plain arrays are used so that the duplicated index labels never take part in alignment.
window_number = df_new3.groupby(level=0).cumcount().values
window_offset = pd.to_timedelta(window_number * (60 * duration_split_hours), unit="min").values
df_new3["charttime"] = pd.to_datetime(df_new3["starttime"]).values + window_offset
# Divide each entry by its number of repeats so the copies sum to the original amount.
df_new3["amount"] = df_new3["amount"] / df_new3["Repeat"]

# Entries that fit within one window are charted at their start time, unchanged.
df_temp2["charttime"] = df_temp2["starttime"]
df_temp4["charttime"] = df_temp4["starttime"]

In [8]:
# Finally, stack the four parts into a single frame.
# pd.concat is used because DataFrame.append is deprecated and removed in pandas 2.0;
# like append, it keeps the original index labels and takes the union of the columns.
inputs_small_4 = pd.concat([df_new1, df_temp2, df_new3, df_temp4])
# The result is a dataset with discrete inputs for each treatment.

Again, for MIMIC-III the data were cleaned at this point. The basis on which certain values were removed is not documented, so we leave this step open until we have first experimental results on the data.

In [9]:
# Write the discretised input events to disk.
processed_inputs_path = "/path/inputs_processed.csv"
inputs_small_4.to_csv(processed_inputs_path)

In [10]:
# Number of distinct admissions present in the processed input events.
n_admissions_remaining = inputs_small_4["hadm_id"].nunique()
n_admissions_remaining

16543