In [1]:
import os
import pathlib
import pandas as pd
from datetime import timedelta


# The project root is the directory two levels above the current working
# directory; all MIMIC-IV data lives in a fixed sub-folder below it.
_project_root = pathlib.Path(os.getcwd()).parents[1]
p_project = str(_project_root)
path_temp = f'{p_project}/data/mimic4'

In [2]:
def _read_event_table(file_name, columns):
    """Load one processed event table, keeping only the requested columns.

    Parameters
    ----------
    file_name : str
        CSV file name inside ``<path_temp>/processed/tables/``.
    columns : list of str
        Columns to keep, in the order they should appear in the result.

    Returns
    -------
    pandas.DataFrame
        The table restricted to ``columns``.

    Raises
    ------
    ValueError
        Raised by ``pd.read_csv`` if one of ``columns`` is not in the file.
    """
    # usecols lets the parser skip every unneeded column instead of reading
    # the whole file and discarding most of it afterwards.
    table = pd.read_csv(path_temp + '/processed/tables/' + file_name,
                        usecols=columns)
    # usecols returns columns in file order; re-select to get the requested order.
    return table[columns]


lab_df = _read_event_table(
    'lab_processed.csv',
    ['subject_id', 'hadm_id', 'charttime', 'valuenum', 'label'])
inputs_df = _read_event_table(
    'inputs_processed.csv',
    ['subject_id', 'hadm_id', 'charttime', 'amount', 'label'])
outputs_df = _read_event_table(
    'outputs_processed.csv',
    ['subject_id', 'hadm_id', 'charttime', 'value', 'label'])
presc_df = _read_event_table(
    'prescriptions_processed.csv',
    ['subject_id', 'hadm_id', 'charttime', 'dose_val_rx', 'drug'])

# Harmonise the schemas: afterwards every table keeps its measurement in
# `valuenum` and its event name in `label`, the layout lab_df already has.
inputs_df = (
    inputs_df
    .assign(valuenum=inputs_df['amount'])
    .drop(columns=['amount'])
)

outputs_df = (
    outputs_df
    .assign(valuenum=outputs_df['value'])
    .drop(columns=['value'])
)

# Prescriptions need both renames: dose -> valuenum and drug -> label.
presc_df = (
    presc_df
    .assign(valuenum=presc_df['dose_val_rx'], label=presc_df['drug'])
    .drop(columns=['dose_val_rx', 'drug'])
)

# Record which source table each event came from so the rows remain
# distinguishable once the four tables are stacked.
for _frame, _origin in (
    (inputs_df, 'Inputs'),
    (lab_df, 'Lab'),
    (outputs_df, 'Outputs'),
    (presc_df, 'Prescriptions'),
):
    _frame['Origin'] = _origin

# Stack the four event tables (inputs, lab, outputs, prescriptions) into one
# long frame, in that order.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. ignore_index=True produces a clean 0..n-1
# index in one step, so no helper 'index'/'level_0' columns are created and
# no intermediate copies of the data are kept around.
source_tables = [inputs_df, lab_df, outputs_df, presc_df]
merged_df = pd.concat(source_tables, ignore_index=True)

# Labels are later mapped to integer codes by name only, so the same label
# must not occur in more than one source table. If the tables are disjoint,
# the number of distinct labels overall equals the sum of the per-table counts.
labels_per_table = sum(table['label'].nunique() for table in source_tables)
assert merged_df['label'].nunique() == labels_per_table, (
    'the same label occurs in more than one source table')

# Express each chart time as an offset from the earliest chart time recorded
# for the same admission (hadm_id).
merged_df['charttime'] = pd.to_datetime(merged_df['charttime'],
                                        format='%Y-%m-%d %H:%M:%S')
ref_time = merged_df.groupby('hadm_id')['charttime'].min()

# Attach the per-admission reference time to every event row.
admission_start = ref_time.to_frame(name='ref_time')
merged_df_1 = admission_start.merge(merged_df,
                                    left_index=True, right_on='hadm_id')
merged_df_1['time_stamp'] = merged_df_1['charttime'] - merged_df_1['ref_time']

# The reference is the minimum of each group, so no offset may be negative.
assert not (merged_df_1['time_stamp'] < timedelta(hours=0)).any()

# Assign an integer code to every distinct label, numbered in the order in
# which the labels first appear in merged_df_1.
unique_labels = merged_df_1['label'].unique()
label_dict = {label: code for code, label in enumerate(unique_labels)}
merged_df_1['label_code'] = merged_df_1['label'].map(label_dict)

# Save the label <-> code lookup table alongside the processed tables.
label_dict_df = pd.DataFrame({'label': unique_labels})
label_dict_df['label_code'] = label_dict_df['label'].map(label_dict)
label_dict_df.to_csv(path_temp + '/processed/tables/label_dict.csv')

# Keep only the columns needed downstream, under their final names.
merged_df_short = merged_df_1[
    ['hadm_id', 'valuenum', 'time_stamp', 'label_code', 'Origin']
].rename(columns={'hadm_id': 'ID', 'time_stamp': 'Time'})

# Select only events within the first 24 hours of each admission.
# The explicit .copy() makes the filtered frame independent of its parent, so
# the column assignment below does not raise SettingWithCopyWarning.
within_first_day = merged_df_short['Time'] < timedelta(hours=24)
merged_df_short = merged_df_short.loc[within_first_day].copy()

# The sampling interval is 1 minute: convert the offset to whole minutes
# (fractions of a minute are truncated by the int cast).
merged_df_short['Time'] = (
    merged_df_short['Time'].dt.total_seconds().div(60).astype(int)
)
assert not (merged_df_short['Time'] > 1440).any(), (
    'found an event later than 24 hours after the reference time')

# 'Origin' was only a bookkeeping tag and is not part of the final dataset.
# drop() returns a new frame, so complete_df is independent of merged_df_short.
complete_df = merged_df_short.drop(columns=['Origin'])

# Create one value column and one mask column per label code, initialised to
# 0.0 / 0. The columns are collected in a dict first and attached with a
# single concat: inserting ~200 columns one at a time fragments the frame and
# makes pandas emit PerformanceWarning (and SettingWithCopyWarning when the
# target is a slice of another frame).
labels = complete_df['label_code'].unique()
value_columns = []
mask_columns = []
placeholder_columns = {}
for num in labels:
    value_name = 'Value_label_' + str(num)
    mask_name = 'Mask_label_' + str(num)
    value_columns.append(value_name)
    mask_columns.append(mask_name)
    # Insertion order is kept, so value/mask columns stay interleaved per label.
    placeholder_columns[value_name] = 0.0
    placeholder_columns[mask_name] = 0

complete_df = pd.concat(
    [complete_df,
     pd.DataFrame(placeholder_columns, index=complete_df.index)],
    axis=1)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  complete_df[name] = 0.0
  complete_df[name2] = 0


In [3]:
# Display the (rows, columns) size of complete_df before missing values are removed.
complete_df.shape

(7253647, 196)

In [4]:
# Remove, in place, every row that contains at least one missing value, then
# display the resulting (rows, columns) size.
complete_df.dropna(axis=0, how='any', inplace=True)
complete_df.shape

(7227943, 196)

In [5]:
# Copy every measurement into the value column of its own label and flag the
# matching mask column with 1.
# This is done with one boolean-mask assignment per label code instead of
# iterating over the rows: iterrows() plus per-cell .at writes is a pure
# Python loop over millions of rows, whereas this loop runs once per label.
label_codes = complete_df['label_code'].astype(int)
for code in label_codes.unique():
    rows_with_code = label_codes == code
    # .to_numpy() assigns by position, so the result does not depend on
    # index alignment.
    complete_df.loc[rows_with_code, 'Value_label_' + str(code)] = (
        complete_df.loc[rows_with_code, 'valuenum'].to_numpy())
    complete_df.loc[rows_with_code, 'Mask_label_' + str(code)] = 1

In [6]:
# The long-format columns are no longer needed once their content lives in
# the Value_/Mask_ columns, so remove them before aggregating.
complete_df.drop(columns=['valuenum', 'label_code'], inplace=True)

# Collapse all rows that share an (ID, Time) pair into a single row by taking
# the column-wise maximum.
# NOTE(review): value columns hold 0.0 wherever a label was not observed, so a
# negative observed value is replaced by 0.0 whenever another row exists for
# the same (ID, Time) - confirm this is acceptable for the data at hand.
events_by_timestamp = complete_df.groupby(['ID', 'Time'], as_index=False)
complete_df_gb = events_by_timestamp.max()

In [7]:
# Sanity check: no aggregated mask column may contain a value above 1.
assert not any(
    (complete_df_gb[column] > 1).any() for column in mask_columns
)

# Store the admission ID as an integer.
complete_df_gb['ID'] = complete_df_gb['ID'].astype(int)

# Display the number of distinct IDs in the final dataset.
complete_df_gb['ID'].nunique()

55181

In [8]:
# Write the aggregated dataset to disk without the row index.
dataset_path = path_temp + '/processed/tables/mimic4_full_dataset.csv'
complete_df_gb.to_csv(dataset_path, index=False)