# Merging all data sources

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import numpy as np

Load all the previously processed data from the source files in the MIMIC4 dataset. <br/>Unify column naming across data sources and merge to one dataframe.

In [2]:
lab_df=pd.read_csv("/path/lab_processed.csv")[["subject_id","hadm_id","charttime","valuenum","label"]]
inputs_df=pd.read_csv("/path/inputs_processed.csv")[["subject_id","hadm_id","charttime","amount","label"]]
outputs_df=pd.read_csv("/path/outputs_processed.csv")[["subject_id","hadm_id","charttime","value","label"]]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
#Change the name of amount. Valuenum for every table
inputs_df["valuenum"]=inputs_df["amount"]
inputs_df.head()
inputs_df=inputs_df.drop(columns=["amount"]).copy()

#Change the name of amount. Valuenum for every table
outputs_df["valuenum"]=outputs_df["value"]
outputs_df=outputs_df.drop(columns=["value"]).copy()
outputs_df=outputs_df.loc[outputs_df["label"] == "Foley"]

#Tag to distinguish between lab and inputs events
inputs_df["Origin"]="Inputs"
lab_df["Origin"]="Lab"
outputs_df["Origin"]="Outputs"


#merge both dfs.
merged_df1=(inputs_df.append(lab_df)).reset_index()
merged_df2=(merged_df1.append(outputs_df)).reset_index()
merged_df=merged_df2.drop(columns="level_0")

#Check that all labels have different names.
assert(merged_df["label"].nunique()==(inputs_df["label"].nunique()+lab_df["label"].nunique()+outputs_df["label"].nunique()))

In [4]:
merged_df['charttime']=pd.to_datetime(merged_df["charttime"], format='%Y-%m-%d %H:%M:%S')
ref_time=merged_df.groupby("hadm_id")["charttime"].min()
merged_df_1=pd.merge(ref_time.to_frame(name="ref_time"),merged_df,left_index=True,right_on="hadm_id")
merged_df_1["time_stamp"]=merged_df_1["charttime"]-merged_df_1["ref_time"]
assert(len(merged_df_1.loc[merged_df_1["time_stamp"]<timedelta(hours=0)].index)==0)

Label Codes are then created to replace label name strings.

In [5]:
#Create a label code (int) for the labels.
label_dict=dict(zip(list(merged_df_1["label"].unique()),range(len(list(merged_df_1["label"].unique())))))
merged_df_1["label_code"]=merged_df_1["label"].map(label_dict)

merged_df_short=merged_df_1[["hadm_id","valuenum","time_stamp","label_code","Origin"]]

In [6]:
label_dict_df=pd.Series(merged_df_1["label"].unique()).reset_index()
label_dict_df.columns=["index","label"]
label_dict_df["label_code"]=label_dict_df["label"].map(label_dict)
label_dict_df.drop(columns=["index"],inplace=True)
label_dict_df.to_csv("/path/label_dict.csv")

In [7]:
merged_df_short["valuenum"] = merged_df_short["valuenum"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_short["valuenum"] = merged_df_short["valuenum"].astype(float)


We then select only the values within the first 48 hours.

In [8]:
merged_df_short=merged_df_short.loc[(merged_df_short["time_stamp"]<timedelta(hours=48))]
merged_df_short["time_stamp"] = merged_df_short["time_stamp"].dt.total_seconds().div(60).astype(int)
print("Number of patients considered: "+str(merged_df_short["hadm_id"].nunique()))
assert(len(merged_df_short.loc[merged_df_short["time_stamp"]>2880].index)==0)

Number of patients considered: 16572


We drop columns we don't need for the final dataset.

In [9]:
covariates = merged_df_short.loc[merged_df_short["Origin"] != "Lab"]
labevents = merged_df_short.loc[merged_df_short["Origin"] == "Lab"]

In [10]:
merged_df_short = labevents
merged_df_short.drop(["Origin"], axis=1, inplace=True)
merged_df_short

Unnamed: 0,hadm_id,valuenum,time_stamp,label_code
10461976,20000397.0,0.90,0,7
10461977,20000397.0,1.00,1670,7
10642280,20000397.0,2.10,0,8
10642281,20000397.0,2.10,1670,8
10985146,20000397.0,4.50,1670,10
...,...,...,...,...
13666308,29999828.0,7.42,2261,38
13786995,29999828.0,119.00,1095,39
13786996,29999828.0,69.00,2261,39
14055249,29999828.0,213.00,2250,23


We create a value- and a mask-column for each label and fill them with the data we have from the dataframe.

In [11]:
complete_df = merged_df_short
labels = complete_df["label_code"].unique()
value_columns = []
mask_columns  = []
for num in labels:
    name = "Value_label_" + str(num)
    name2 = "Mask_label_" + str(num)
    value_columns.append(name)
    mask_columns.append(name2)
    complete_df[name] = 0
    complete_df[name2] = 0
    complete_df[name] = complete_df[name].astype(float)

In [12]:
complete_df.dropna(inplace=True)
for index, row in complete_df.iterrows():
    name = "Value_label_" + str(row["label_code"].astype(int))
    name2 = "Mask_label_" + str(row["label_code"].astype(int))
    complete_df.at[index, name] = row["valuenum"]
    complete_df.at[index, name2] = 1

Drop columns that are not needed anymore and do a sanity check on the masks.

In [13]:
complete_df.drop(["valuenum", "label_code"], axis=1, inplace=True)
complete_df = complete_df.groupby(["hadm_id", "time_stamp"], as_index=False).max()
for x in mask_columns:
    assert(len(complete_df.loc[complete_df[x]>1])==0)
complete_df

Unnamed: 0,hadm_id,time_stamp,Value_label_7,Mask_label_7,Value_label_8,Mask_label_8,Value_label_10,Mask_label_10,Value_label_11,Mask_label_11,...,Value_label_34,Mask_label_34,Value_label_35,Mask_label_35,Value_label_36,Mask_label_36,Value_label_37,Mask_label_37,Value_label_39,Mask_label_39
0,20000397.0,0,0.9,1,2.1,1,4.4,1,138.0,1,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,20000397.0,410,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
2,20000397.0,790,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
3,20000397.0,1229,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
4,20000397.0,1670,1.0,1,2.1,1,4.5,1,139.0,1,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154399,29999828.0,1040,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
154400,29999828.0,1095,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,1,30.0,1,0.0,0,62.0,1,119.0,1
154401,29999828.0,1229,0.9,1,1.4,1,4.7,1,135.0,1,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
154402,29999828.0,2250,0.8,1,1.5,1,4.3,1,137.0,1,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0


Assign a new unique ID to all admission ids and story mapping in a separate csv file.

In [14]:
unique_ids=np.arange(complete_df["hadm_id"].nunique())
np.random.shuffle(unique_ids)
d=dict(zip(complete_df["hadm_id"].unique(),unique_ids))

Unique_id_dict=pd.Series(complete_df["hadm_id"].unique()).reset_index().copy()
Unique_id_dict.columns=["index","hadm_id"]
Unique_id_dict["unique_id"]=Unique_id_dict["hadm_id"].map(d)
Unique_id_dict.to_csv("/path/unique_id_dict.csv")

unique_id_df = pd.read_csv("/path/unique_id_dict.csv")
d = dict(zip(unique_id_df["hadm_id"].values,unique_id_df["unique_id"].values))

Prepare dataframe for ODE dataset class

In [15]:
complete_df["unique_id"] = complete_df["hadm_id"].map(d)
complete_df

Unnamed: 0,hadm_id,time_stamp,Value_label_7,Mask_label_7,Value_label_8,Mask_label_8,Value_label_10,Mask_label_10,Value_label_11,Mask_label_11,...,Mask_label_34,Value_label_35,Mask_label_35,Value_label_36,Mask_label_36,Value_label_37,Mask_label_37,Value_label_39,Mask_label_39,unique_id
0,20000397.0,0,0.9,1,2.1,1,4.4,1,138.0,1,...,0,0.0,0,0.0,0,0.0,0,0.0,0,9147
1,20000397.0,410,0.0,0,0.0,0,0.0,0,0.0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,9147
2,20000397.0,790,0.0,0,0.0,0,0.0,0,0.0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,9147
3,20000397.0,1229,0.0,0,0.0,0,0.0,0,0.0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,9147
4,20000397.0,1670,1.0,1,2.1,1,4.5,1,139.0,1,...,0,0.0,0,0.0,0,0.0,0,0.0,0,9147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154399,29999828.0,1040,0.0,0,0.0,0,0.0,0,0.0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,13778
154400,29999828.0,1095,0.0,0,0.0,0,0.0,0,0.0,0,...,1,30.0,1,0.0,0,62.0,1,119.0,1,13778
154401,29999828.0,1229,0.9,1,1.4,1,4.7,1,135.0,1,...,0,0.0,0,0.0,0,0.0,0,0.0,0,13778
154402,29999828.0,2250,0.8,1,1.5,1,4.3,1,137.0,1,...,0,0.0,0,0.0,0,0.0,0,0.0,0,13778


In [16]:
complete_df.rename(columns={"unique_id": "ID", "time_stamp": "Time"}, inplace=True)
complete_df.drop(["hadm_id"], axis=1, inplace=True)
complete_df.set_index(["ID"], inplace=True)
complete_df

Unnamed: 0_level_0,Time,Value_label_7,Mask_label_7,Value_label_8,Mask_label_8,Value_label_10,Mask_label_10,Value_label_11,Mask_label_11,Value_label_12,...,Value_label_34,Mask_label_34,Value_label_35,Mask_label_35,Value_label_36,Mask_label_36,Value_label_37,Mask_label_37,Value_label_39,Mask_label_39
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9147,0,0.9,1,2.1,1,4.4,1,138.0,1,17.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
9147,410,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
9147,790,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
9147,1229,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
9147,1670,1.0,1,2.1,1,4.5,1,139.0,1,15.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13778,1040,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
13778,1095,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,1,30.0,1,0.0,0,62.0,1,119.0,1
13778,1229,0.9,1,1.4,1,4.7,1,135.0,1,13.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
13778,2250,0.8,1,1.5,1,4.3,1,137.0,1,10.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [17]:
complete_df.to_csv("/path/full_dataset.csv")

# Creating MIMIC covariates

In [18]:
covariates.drop(["Origin"], axis=1, inplace=True)
covariates

Unnamed: 0,hadm_id,valuenum,time_stamp,label_code
457402,20000808.0,1.000000,0,26
515766,20000808.0,10.000000,0,27
653545,20000808.0,120.000000,737,0
946298,20000808.0,1.000000,618,29
1009178,20000808.0,10.000001,753,32
...,...,...,...,...
14432621,29999828.0,250.000000,2520,41
14432622,29999828.0,65.000000,2580,41
14432623,29999828.0,125.000000,2700,41
14432624,29999828.0,125.000000,2760,41


In [19]:
complete_df = covariates
labels = complete_df["label_code"].unique()
value_columns = []
for num in labels:
    name = "Value_label_" + str(num)
    value_columns.append(name)
    complete_df[name] = 0
    complete_df[name] = complete_df[name].astype(float)

In [20]:
complete_df.dropna(inplace=True)
for index, row in complete_df.iterrows():
    name = "Value_label_" + str(row["label_code"].astype(int))
    complete_df.at[index, name] = row["valuenum"]

In [21]:
complete_df.drop(["valuenum", "label_code"], axis=1, inplace=True)
complete_df = complete_df.groupby(["hadm_id", "time_stamp"], as_index=False).max()
complete_df

Unnamed: 0,hadm_id,time_stamp,Value_label_26,Value_label_27,Value_label_0,Value_label_29,Value_label_32,Value_label_42,Value_label_24,Value_label_25,...,Value_label_82,Value_label_83,Value_label_2,Value_label_85,Value_label_84,Value_label_87,Value_label_88,Value_label_89,Value_label_86,Value_label_48
0,20000808.0,0,1.0,10.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20000808.0,618,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20000808.0,737,0.0,0.0,120.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20000808.0,753,0.0,0.0,0.0,0.0,10.000001,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20001361.0,0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2149973,29999828.0,2773,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2149974,29999828.0,2799,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2149975,29999828.0,2820,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2149976,29999828.0,2829,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
unique_id_df = pd.read_csv("/path/unique_id_dict.csv")
d = dict(zip(unique_id_df["hadm_id"].values,unique_id_df["unique_id"].values))
complete_df["unique_id"] = complete_df["hadm_id"].map(d)

In [23]:
complete_df.rename(columns={"unique_id": "ID", "time_stamp": "Time"}, inplace=True)
complete_df.drop(["hadm_id"], axis=1, inplace=True)
complete_df.set_index(["ID"], inplace=True)

In [24]:
complete_df.to_csv("/path/covariates.csv")

In [25]:
complete_df

Unnamed: 0_level_0,Time,Value_label_26,Value_label_27,Value_label_0,Value_label_29,Value_label_32,Value_label_42,Value_label_24,Value_label_25,Value_label_43,...,Value_label_82,Value_label_83,Value_label_2,Value_label_85,Value_label_84,Value_label_87,Value_label_88,Value_label_89,Value_label_86,Value_label_48
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10938.0,0,1.0,10.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10938.0,618,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10938.0,737,0.0,0.0,120.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10938.0,753,0.0,0.0,0.0,0.0,10.000001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7274.0,0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13778.0,2773,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13778.0,2799,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13778.0,2820,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13778.0,2829,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
