# Preparing the MIMIC 3 dataset

In [3]:
import os
import pathlib

p_project = str(pathlib.Path(os.getcwd()).parents[1])

In [4]:
from datetime import timedelta

import pandas as pd
import numpy as np

In [6]:
lab_df=pd.read_csv(p_project + "/data/mimic3/processed/lab_processed.csv")[["SUBJECT_ID","HADM_ID","CHARTTIME","VALUENUM","LABEL"]]
inputs_df=pd.read_csv(p_project + "/data/mimic3/processed/inputs_processed.csv")[["SUBJECT_ID","HADM_ID","CHARTTIME","AMOUNT","LABEL"]]
outputs_df=pd.read_csv(p_project + "/data/mimic3/processed/outputs_processed.csv")[["SUBJECT_ID","HADM_ID","CHARTTIME","VALUE","LABEL"]]
presc_df=pd.read_csv(p_project + "/data/mimic3/processed/prescriptions_processed.csv")[["SUBJECT_ID","HADM_ID","CHARTTIME","DOSE_VAL_RX","DRUG"]]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
inputs_df["VALUENUM"]=inputs_df["AMOUNT"]
inputs_df=inputs_df.drop(columns=["AMOUNT"]).copy()

outputs_df["VALUENUM"]=outputs_df["VALUE"]
outputs_df=outputs_df.drop(columns=["VALUE"]).copy()

presc_df["VALUENUM"]=presc_df["DOSE_VAL_RX"]
presc_df=presc_df.drop(columns=["DOSE_VAL_RX"]).copy()
presc_df["LABEL"]=presc_df["DRUG"]
presc_df=presc_df.drop(columns=["DRUG"]).copy()

inputs_df["Origin"]="Inputs"
lab_df["Origin"]="Lab"
outputs_df["Origin"]="Outputs"
presc_df["Origin"]="Prescriptions"

merged_df1=(inputs_df.append(lab_df)).reset_index()
merged_df2=(merged_df1.append(outputs_df)).reset_index()
merged_df2.drop(columns="level_0",inplace=True)
merged_df=(merged_df2.append(presc_df)).reset_index()

assert(merged_df["LABEL"].nunique()==(inputs_df["LABEL"].nunique()+lab_df["LABEL"].nunique()+outputs_df["LABEL"].nunique()+presc_df["LABEL"].nunique()))

In [8]:
merged_df.head()

Unnamed: 0,level_0,index,SUBJECT_ID,HADM_ID,CHARTTIME,LABEL,VALUENUM,Origin
0,0,0.0,27063,139787.0,2133-02-05 06:29:00,Potassium Chloride,1.354906,Inputs
1,1,1.0,27063,139787.0,2133-02-05 06:59:00,Potassium Chloride,1.354906,Inputs
2,2,2.0,27063,139787.0,2133-02-05 07:29:00,Potassium Chloride,1.354906,Inputs
3,3,3.0,27063,139787.0,2133-02-05 07:59:00,Potassium Chloride,1.354906,Inputs
4,4,4.0,27063,139787.0,2133-02-05 08:29:00,Potassium Chloride,1.354906,Inputs


In [9]:
merged_df['CHARTTIME']=pd.to_datetime(merged_df["CHARTTIME"], format='%Y-%m-%d %H:%M:%S')
ref_time=merged_df.groupby("HADM_ID")["CHARTTIME"].min()

merged_df_1=pd.merge(ref_time.to_frame(name="REF_TIME"),merged_df,left_index=True,right_on="HADM_ID")
merged_df_1["TIME_STAMP"]=merged_df_1["CHARTTIME"]-merged_df_1["REF_TIME"]
assert(len(merged_df_1.loc[merged_df_1["TIME_STAMP"]<timedelta(hours=0)].index)==0)

In [11]:
label_dict=dict(zip(list(merged_df_1["LABEL"].unique()),range(len(list(merged_df_1["LABEL"].unique())))))
merged_df_1["LABEL_CODE"]=merged_df_1["LABEL"].map(label_dict)

merged_df_short=merged_df_1[["HADM_ID","VALUENUM","TIME_STAMP","LABEL_CODE","Origin"]]

In [12]:
label_dict_df=pd.Series(merged_df_1["LABEL"].unique()).reset_index()
label_dict_df.columns=["index","LABEL"]
label_dict_df["LABEL_CODE"]=label_dict_df["LABEL"].map(label_dict)
label_dict_df.drop(columns=["index"],inplace=True)
label_dict_df.to_csv(p_project + "/data/mimic3/processed/lab_label_dict.csv")

In [13]:
merged_df_short["VALUENUM"] = merged_df_short["VALUENUM"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_short["VALUENUM"] = merged_df_short["VALUENUM"].astype(float)


In [14]:
merged_df_short=merged_df_short.loc[(merged_df_short["TIME_STAMP"]<pd.Timedelta("48 hours"))]

merged_df_short["TIME_STAMP"] = merged_df_short["TIME_STAMP"].dt.total_seconds().div(60).astype(int)

print("Number of patients considered :"+str(merged_df_short["HADM_ID"].nunique()))

Number of patients considered :24681


In [17]:
complete_df = merged_df_short
labels = complete_df["LABEL_CODE"].unique()
value_columns = []
mask_columns  = []
for num in labels:
    name = "Value_label_" + str(num)
    name2 = "Mask_label_" + str(num)
    value_columns.append(name)
    mask_columns.append(name2)
    complete_df[name] = 0
    complete_df[name2] = 0
    complete_df[name] = complete_df[name].astype(float)

  complete_df[name] = 0
  complete_df[name2] = 0


In [18]:
complete_df.dropna(inplace=True)
for index, row in complete_df.iterrows():
    name = "Value_label_" + str(row["LABEL_CODE"])
    name2 = "Mask_label_" + str(row["LABEL_CODE"])
    complete_df.at[index, name] = row["VALUENUM"]
    complete_df.at[index, name2] = 1

In [19]:
complete_df.shape

(3858805, 197)

In [20]:
complete_df.head()

Unnamed: 0,HADM_ID,VALUENUM,TIME_STAMP,LABEL_CODE,Origin,Value_label_0,Mask_label_0,Value_label_1,Mask_label_1,Value_label_2,...,Value_label_92,Mask_label_92,Value_label_95,Mask_label_95,Value_label_94,Mask_label_94,Value_label_93,Mask_label_93,Value_label_91,Mask_label_91
4824329,100007.0,15.0,0,0,Lab,15.0,1,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
4824330,100007.0,12.0,791,0,Lab,12.0,1,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
4824331,100007.0,10.0,1770,0,Lab,10.0,1,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
5090579,100007.0,25.0,0,1,Lab,0.0,0,25.0,1,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
5090580,100007.0,25.0,791,1,Lab,0.0,0,25.0,1,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [21]:
complete_df = complete_df.groupby(["HADM_ID", "TIME_STAMP"], as_index=False).max()
for x in mask_columns:
    assert(len(complete_df.loc[complete_df[x]>1])==0)
complete_df

Unnamed: 0,HADM_ID,TIME_STAMP,VALUENUM,LABEL_CODE,Origin,Value_label_0,Mask_label_0,Value_label_1,Mask_label_1,Value_label_2,...,Value_label_92,Mask_label_92,Value_label_95,Mask_label_95,Value_label_94,Mask_label_94,Value_label_93,Mask_label_93,Value_label_91,Mask_label_91
0,100007.0,0,224.00,36,Lab,15.0,1,25.0,1,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,100007.0,140,8.00,37,Lab,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
2,100007.0,165,4.00,40,Prescriptions,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
3,100007.0,209,3.10,31,Lab,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
4,100007.0,791,259.00,27,Lab,12.0,1,25.0,1,8.9,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129299,199999.0,2035,277.00,27,Lab,13.0,1,23.0,1,7.9,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1129300,199999.0,2520,5.00,18,Lab,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1129301,199999.0,2546,7.44,33,Lab,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1129302,199999.0,2564,74.00,34,Lab,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [22]:
complete_df = complete_df.rename(columns={"HADM_ID": "hadm_id", "TIME_STAMP": "time_stamp"})
complete_df.drop(["Origin"], axis=1, inplace=True)

In [23]:
complete_df.to_csv(p_project + "/data/mimic3/processed/mimic3_full_dataset.csv")