In [None]:
# first run code from https://github.com/USC-Melady/Benchmarking_DL_MIMICIII to get the pre-processed files

# import
import numpy as np
import sklearn
from sklearn import metrics
import pandas as pd
import datetime

# root_path: where Benchmarking_DL_MIMICIII code is run/stored
# res_path: where resulting data should be saved
# Both values are joined to file names with plain `+` string concatenation
# below, so each must be a string ending with a path separator (e.g. '/').
root_path = ... # to be filled
res_path = ... # to be filled

# load data
# Column names applied to the header-less non-series input.csv.
# NOTE(review): the order is presumed to match the column order produced by
# the Benchmarking_DL_MIMICIII pre-processing -- confirm against that code.
header_mimic_input = [
    # demographics
    'age',
    # vital signs
    'heartrate_max', 'heartrate_min',
    'sysbp_max', 'sysbp_min',
    'tempc_max', 'tempc_min',
    # respiratory / renal
    'pao2fio2_vent_min',
    'urineoutput',
    # laboratory values
    'bun_min', 'bun_max',
    'wbc_min', 'wbc_max',
    'potassium_min', 'potassium_max',
    'sodium_min', 'sodium_max',
    'bicarbonate_min', 'bicarbonate_max',
    'bilirubin_min', 'bilirubin_max',
    # remaining scores / flags
    'mingcs',
    'aids',
    'hem',
    'mets',
    'admissiontype',
]

# Column names applied to the header-less non-series output.csv
# (one column per prediction task).
header_mimic_output = [
    'hospital_mort',
    '1_day_mort',
    '2_day_mort',
    '3_day_mort',
    '30_day_mort',
    '1_year_mort',
]

# input file - non-series
# The raw CSV has no header row: write a copy with the feature names attached
# to the working directory, then load that named copy back as df_input.
raw_input_path = root_path + 'data/admdata_17f/24hrs/non_series/input.csv'
pd.read_csv(raw_input_path, header=None).to_csv(
    'input_mimic.csv', header=header_mimic_input, index=False
)
df_input = pd.read_csv('input_mimic.csv')

In [None]:
# get valid admissions
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code -- only load this file if it was produced by your own run of the
# pre-processing pipeline.
merged_path = root_path + 'data/admdata_17f/24hrs/DB_merged_24hrs.npy'
data = np.load(merged_path, allow_pickle=True)

# the admission id is the last entry of the first element of every record
valid_aids = [record[0][-1] for record in data]
print(len(valid_aids))

In [None]:
# use valid admissions to filter; the row count is printed after each step
df = pd.read_csv(root_path + 'data/17features_first24h.csv')
print(len(df))
df = df[df['hadm_id'].isin(valid_aids)]
print(len(df))
# keep a single row (the first one encountered) per patient
df = df.drop_duplicates('subject_id')
print(len(df))

# drop columns
df = df.drop(columns=['intime', 'outtime'])

# encode the admission type as an integer code
df.loc[df['admissiontype'] == 'Medical', 'admissiontype'] = 0
df.loc[df['admissiontype'] == 'ScheduledSurgical', 'admissiontype'] = 1
df.loc[df['admissiontype'] == 'UnscheduledSurgical', 'admissiontype'] = 2

# impute missing values column by column:
#   - 'mingcs' / 'admissiontype': rounded column mean
#   - 'aids' / 'hem' / 'mets':     0
#   - every other column:          column mean
# The filled column is assigned back to df explicitly. Calling
# fillna(inplace=True) on a Series pulled out of df is chained assignment and
# does not update df when pandas Copy-on-Write is active, so the imputation
# would silently be lost.
for column in df.columns:
    n_missing = df[column].isnull().sum()
    if column in ('mingcs', 'admissiontype'):
        fill_value = round(df[column].mean())
        print(column, n_missing, fill_value)
    elif column in ('aids', 'hem', 'mets'):
        fill_value = 0
        print(column, n_missing)
    else:
        fill_value = df[column].mean()
        print(column, n_missing, fill_value)
    df[column] = df[column].fillna(fill_value)

In [None]:
# get gender info from PATIENTS.csv from MIMIC-III (change the path if it is stored somewhere else)
data_gender = pd.read_csv(root_path + 'PATIENTS.csv')

# keep only the patients present in df, and only the id and gender columns
in_cohort = data_gender['SUBJECT_ID'].isin(df['subject_id'])
data_gender = data_gender.loc[in_cohort, ['SUBJECT_ID', 'GENDER']].rename(
    columns={"SUBJECT_ID": "subject_id", "GENDER": "gender"}
)
print(len(data_gender))

# attach the gender column to every row of df
df = df.merge(data_gender, on='subject_id')

In [None]:
# get other infos from ADMISSIONS.csv from MIMIC-III (change the path if it is stored somewhere else)
# Insurance, religion, marital status and ethnicity are recorded per admission,
# so they are linked through hadm_id as well as subject_id. Matching on
# subject_id alone (after de-duplicating on it) would pick the attributes of
# whichever admission of the patient is listed first in ADMISSIONS.csv, which
# is not necessarily the admission whose features are in df.
admission_columns = {"SUBJECT_ID": "subject_id",
                     "HADM_ID": "hadm_id",
                     "INSURANCE": "insurance",
                     "RELIGION": "religion",
                     "MARITAL_STATUS": "marital_status",
                     "ETHNICITY": "ethnicity"}
data_others = pd.read_csv(root_path + 'ADMISSIONS.csv')
data_others = data_others.loc[data_others['HADM_ID'].isin(df['hadm_id']),
                              list(admission_columns)]
data_others = data_others.rename(columns=admission_columns)
# guard against repeated admission rows so the merge cannot multiply rows of df
data_others = data_others.drop_duplicates('hadm_id')
df = pd.merge(df, data_others, on=['subject_id', 'hadm_id'])

# drop columns (identifiers are not part of the saved data set)
df = df.drop(columns=['subject_id', 'hadm_id', 'icustay_id'])

# save the data
df.to_csv(res_path + 'mimic_non_series.csv', index=False)
print(len(df))

In [None]:
# prediction tasks
# The raw CSV has no header row: write a named copy to the working directory
# and load it back so the columns carry the task names.
df_output = pd.read_csv(root_path + 'data/admdata_17f/24hrs/non_series/output.csv', header=None)
df_output.to_csv('output_mimic.csv', header=header_mimic_output, index=False)
df_output = pd.read_csv('output_mimic.csv')
print(len(df_output))

# merge
# pd.concat(axis=1) aligns the two frames on their index and pads with NaN
# where one side has no matching row, so a row-count mismatch would silently
# yield rows without features or without labels.
# NOTE(review): this also presumes that row i of df and row i of output.csv
# describe the same admission -- confirm against the pre-processing order.
if len(df) != len(df_output):
    print('WARNING: feature rows (%d) and label rows (%d) differ; '
          'concatenated rows will be padded with NaN' % (len(df), len(df_output)))

# drop label columns left over from a previous run of this cell so that
# re-running it does not duplicate them
df = df.drop(columns=header_mimic_output, errors='ignore')
df = pd.concat([df, df_output], axis=1)

# save the data
df.to_csv(res_path + 'mimic_non_series.csv', index=False)
print(df)