# Merging all data sources

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import numpy as np

Load all the previously processed data from the source files in the MIMIC4 dataset. <br/>Unify column naming across data sources and merge to one dataframe.

In [2]:
lab_df=pd.read_csv("/path/lab_processed.csv")[["subject_id","hadm_id","charttime","valuenum","label"]]
inputs_df=pd.read_csv("/path/inputs_processed.csv")[["subject_id","hadm_id","charttime","amount","label"]]
outputs_df=pd.read_csv("/path/outputs_processed.csv")[["subject_id","hadm_id","charttime","value","label"]]
presc_df=pd.read_csv("/path/prescriptions_processed.csv")[["subject_id","hadm_id","charttime","dose_val_rx","drug"]]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
#Change the name of amount. Valuenum for every table
inputs_df["valuenum"]=inputs_df["amount"]
inputs_df.head()
inputs_df=inputs_df.drop(columns=["amount"]).copy()

#Change the name of amount. Valuenum for every table
outputs_df["valuenum"]=outputs_df["value"]
outputs_df=outputs_df.drop(columns=["value"]).copy()

#Change the name of amount. Valuenum for every table
presc_df["valuenum"]=presc_df["dose_val_rx"]
presc_df=presc_df.drop(columns=["dose_val_rx"]).copy()
presc_df["label"]=presc_df["drug"]
presc_df=presc_df.drop(columns=["drug"]).copy()
# remove weird values from dataframe
presc_df = presc_df.drop((presc_df['valuenum']=='3-10').index)


#Tag to distinguish between lab and inputs events
inputs_df["Origin"]="Inputs"
lab_df["Origin"]="Lab"
outputs_df["Origin"]="Outputs"
presc_df["Origin"]="Prescriptions"


#merge both dfs.
merged_df1=(inputs_df.append(lab_df)).reset_index()
merged_df2=(merged_df1.append(outputs_df)).reset_index()
merged_df2.drop(columns="level_0",inplace=True)
merged_df=(merged_df2.append(presc_df)).reset_index()

#merged_df=lab_df.reset_index()

#Check that all labels have different names.
assert(merged_df["label"].nunique()==(inputs_df["label"].nunique()+lab_df["label"].nunique()+outputs_df["label"].nunique()+presc_df["label"].nunique()))

In [5]:
merged_df.head()

Unnamed: 0,level_0,index,subject_id,hadm_id,charttime,label,valuenum,Origin
0,0,0.0,13859862,25015072.0,2152-04-07 18:30:00,Potassium Chloride,10.0,Inputs
1,1,1.0,13859862,25015072.0,2152-04-07 19:00:00,Potassium Chloride,10.0,Inputs
2,2,2.0,13859862,25015072.0,2152-04-08 12:00:00,Potassium Chloride,10.0,Inputs
3,3,3.0,13859862,25015072.0,2152-04-08 12:30:00,Potassium Chloride,10.0,Inputs
4,4,4.0,13859862,25015072.0,2152-04-09 15:53:00,Potassium Chloride,10.0,Inputs


We set the timestamp as the time delta between the first chart time for each admission.

In [6]:
merged_df['charttime']=pd.to_datetime(merged_df["charttime"], format='%Y-%m-%d %H:%M:%S')
ref_time=merged_df.groupby("hadm_id")["charttime"].min()
merged_df_1=pd.merge(ref_time.to_frame(name="ref_time"),merged_df,left_index=True,right_on="hadm_id")
merged_df_1["time_stamp"]=merged_df_1["charttime"]-merged_df_1["ref_time"]
assert(len(merged_df_1.loc[merged_df_1["time_stamp"]<timedelta(hours=0)].index)==0)

Label Codes are then created to replace label name strings.

In [7]:
#Create a label code (int) for the labels.
label_dict=dict(zip(list(merged_df_1["label"].unique()),range(len(list(merged_df_1["label"].unique())))))
merged_df_1["label_code"]=merged_df_1["label"].map(label_dict)

merged_df_short=merged_df_1[["hadm_id","valuenum","time_stamp","label_code","Origin"]]

In [8]:
label_dict_df=pd.Series(merged_df_1["label"].unique()).reset_index()
label_dict_df.columns=["index","label"]
label_dict_df["label_code"]=label_dict_df["label"].map(label_dict)
label_dict_df.drop(columns=["index"],inplace=True)
label_dict_df.to_csv("/path/label_dict.csv")

In [9]:
merged_df_short["valuenum"] = merged_df_short["valuenum"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_short["valuenum"] = merged_df_short["valuenum"].astype(float)


We then select only the values within the first 48 hours.

In [10]:
merged_df_short=merged_df_short.loc[(merged_df_short["time_stamp"]<timedelta(hours=48))]
merged_df_short["time_stamp"] = merged_df_short["time_stamp"].dt.total_seconds().div(60).astype(int)
print("Number of patients considered: "+str(merged_df_short["hadm_id"].nunique()))
assert(len(merged_df_short.loc[merged_df_short["time_stamp"]>2880].index)==0)

Number of patients considered: 16572


We drop columns we don't need for the final dataset.

In [11]:
merged_df_short.drop(["Origin"], axis=1, inplace=True)
merged_df_short

Unnamed: 0,hadm_id,valuenum,time_stamp,label_code
10461976,20000397.0,0.9,0,7
10461977,20000397.0,1.0,1670,7
10642280,20000397.0,2.1,0,8
10642281,20000397.0,2.1,1670,8
10985146,20000397.0,4.5,1670,10
...,...,...,...,...
14432622,29999828.0,65.0,2580,42
14432623,29999828.0,125.0,2700,42
14432624,29999828.0,125.0,2760,42
14432625,29999828.0,60.0,2820,42


In [13]:
merged_df_short.to_csv("/path/complete_df(intermediate).csv")

There are some memory issues where the kernel dies even though the memory of the process is set high enough. One workaround is to read the complete_df csv in 3 chunks, process them seperately and append them afterwards.

In [None]:
complete_df = pd.read_csv("/path/complete_df(intermediate).csv")

We create a value- and a mask-column for each label and fill them with the data we have from the dataframe.

In [None]:
labels = complete_df["label_code"].unique()
value_columns = []
mask_columns  = []
for num in labels:
    name = "Value_label_" + str(num)
    name2 = "Mask_label_" + str(num)
    value_columns.append(name)
    mask_columns.append(name2)
    complete_df[name] = 0
    complete_df[name2] = 0
    complete_df[name] = complete_df[name].astype(float)

In [4]:
complete_df.dropna(inplace=True)
for index, row in complete_df.iterrows():
    name = "Value_label_" + str(row["label_code"].astype(int))
    name2 = "Mask_label_" + str(row["label_code"].astype(int))
    complete_df.at[index, name] = row["valuenum"]
    complete_df.at[index, name2] = 1

Drop columns that are not needed anymore and do a sanity check on the masks.

In [None]:
complete_df.drop(["valuenum", "label_code"], axis=1, inplace=True)
complete_df = complete_df.groupby(["hadm_id", "time_stamp"], as_index=False).max()
for x in mask_columns:
    assert(len(complete_df.loc[complete_df[x]>1])==0)
complete_df


Assign a new unique ID to all admission ids and story mapping in a separate csv file.

In [5]:
unique_ids=np.arange(complete_df["hadm_id"].nunique())
np.random.shuffle(unique_ids)
d=dict(zip(complete_df["hadm_id"].unique(),unique_ids))

Unique_id_dict=pd.Series(complete_df["hadm_id"].unique()).reset_index().copy()
Unique_id_dict.columns=["index","hadm_id"]
Unique_id_dict["unique_id"]=Unique_id_dict["hadm_id"].map(d)
Unique_id_dict.to_csv("/path/unique_id_dict.csv")

unique_id_df = pd.read_csv("/path/unique_id_dict.csv")
d = dict(zip(unique_id_df["hadm_id"].values,unique_id_df["unique_id"].values))

Create labels for classification task, target: death tag.

In [6]:
admissions=pd.read_csv("/path/admissions_processed.csv")
death_tags_s=admissions.groupby("hadm_id")["deathtag"].unique().astype(int).to_frame().reset_index()
death_tags_df=death_tags_s.loc[death_tags_s["hadm_id"].isin(complete_df["hadm_id"])].copy()
death_tags_df["unique_id"]=death_tags_df["hadm_id"].map(d)
death_tags_df.sort_values(by="unique_id",inplace=True)
death_tags_df.rename(columns={"deathtag":"Value"},inplace=True)
death_tags_df.set_index("unique_id", inplace=True)
death_tags_df.drop("hadm_id", axis=1, inplace=True)
death_tags_df.to_csv("/path/complete_death_tags.csv")

complete_df["unique_id"] = complete_df["hadm_id"].map(d)

Prepare dataframe for ODE dataset class

In [13]:
complete_df.rename(columns={"unique_id": "ID", "time_stamp": "Time"}, inplace=True)
complete_df.drop(["hadm_id"], axis=1, inplace=True)
complete_df.set_index(["ID"], inplace=True)
complete_df

Unnamed: 0_level_0,Time,Value_label_7,Mask_label_7,Value_label_8,Mask_label_8,Value_label_10,Mask_label_10,Value_label_11,Mask_label_11,Value_label_12,...,Value_label_52,Mask_label_52,Value_label_45,Mask_label_45,Value_label_99,Mask_label_99,Value_label_100,Mask_label_100,Value_label_101,Mask_label_101
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16039,0,0.9,1,2.1,1,4.4,1,138.0,1,17.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
16039,410,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
16039,790,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
16039,1229,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
16039,1670,1.0,1,2.1,1,4.5,1,139.0,1,15.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7013,2773,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
7013,2799,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
7013,2820,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
7013,2829,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [14]:
complete_df.to_csv("/path/full_dataset.csv")