In [2]:
import os
import pathlib
import shutil
from datetime import timedelta

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
path_mimic4 = pathlib.Path(os.getcwd()).parents[1]/'data/mimic4'
path_processed = path_mimic4/'processed'

In [3]:
lab_df = pd.read_csv(path_processed/'tables/lab_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'valuenum', 'label']]
inputs_df = pd.read_csv(path_processed/'tables/inputs_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'amount', 'label']]
outputs_df = pd.read_csv(path_processed/'tables/outputs_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'value', 'label']]
presc_df = pd.read_csv(path_processed/'tables/prescriptions_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'dose_val_rx', 'drug']]

# Change the name of amount. Valuenum for every table
inputs_df['valuenum'] = inputs_df['amount']
inputs_df = inputs_df.drop(columns=['amount']).copy()

outputs_df['valuenum'] = outputs_df['value']
outputs_df = outputs_df.drop(columns=['value']).copy()

presc_df['valuenum'] = presc_df['dose_val_rx']
presc_df = presc_df.drop(columns=['dose_val_rx']).copy()
presc_df['label'] = presc_df['drug']
presc_df = presc_df.drop(columns=['drug']).copy()

# Tag to distinguish between lab and inputs events
inputs_df['Origin'] = 'Inputs'
lab_df['Origin'] = 'Lab'
outputs_df['Origin'] = 'Outputs'
presc_df['Origin'] = 'Prescriptions'

merged_df = pd.concat([inputs_df, lab_df, outputs_df, presc_df]).reset_index()

# Check that all labels have different names.
assert(merged_df['label'].nunique() == (inputs_df['label'].nunique(
)+lab_df['label'].nunique()+outputs_df['label'].nunique()+presc_df['label'].nunique()))

# set the timestamp as the time delta between the first chart time for each admission
merged_df['charttime'] = pd.to_datetime(
    merged_df['charttime'], format='%Y-%m-%d %H:%M:%S')
ref_time = merged_df.groupby('hadm_id')['charttime'].min()
merged_df_1 = pd.merge(ref_time.to_frame(name='ref_time'),
                       merged_df, left_index=True, right_on='hadm_id')
merged_df_1['time_stamp'] = merged_df_1['charttime']-merged_df_1['ref_time']
assert(len(merged_df_1.loc[merged_df_1['time_stamp']
       < timedelta(hours=0)].index) == 0)

# Create a label code (int) for the labels.
label_dict = dict(zip(list(merged_df_1['label'].unique()), range(
    len(list(merged_df_1['label'].unique())))))
merged_df_1['label_code'] = merged_df_1['label'].map(label_dict)

  inputs_df = pd.read_csv(path_processed/'tables/inputs_processed.csv')[
  presc_df = pd.read_csv(path_processed/'tables/prescriptions_processed.csv')[


In [4]:
merged_df_short = merged_df_1[['hadm_id', 'valuenum', 'time_stamp', 'label_code', 'Origin']].rename(
    columns={'hadm_id': 'ID', 'time_stamp': 'Time'})

# Make sure that the selected admissions have observations after 24 hours
ids_selected = merged_df_short[(merged_df_short['Time'] > timedelta(hours=24)) & 
                               (merged_df_short['Time'] < timedelta(hours=48))]['ID'].unique()

# select only values within first 48 hours
merged_df_short = merged_df_short[merged_df_short['ID'].isin(ids_selected)].loc[(merged_df_short['Time'] < timedelta(hours=48))]

merged_df_short['Time'] = merged_df_short['Time'].dt.total_seconds().div(60).astype(int)
assert(len(merged_df_short.loc[merged_df_short['Time'] > 2880].index) == 0)

# drop columns that are not needed for final dataset
merged_df_short.drop(['Origin'], axis=1, inplace=True)
complete_df = merged_df_short


In [5]:
# create value- and mask- columns and fill with data
labels = complete_df['label_code'].unique()
value_columns = []
mask_columns = []
for num in labels:
    name = 'Value_label_' + str(num)
    name2 = 'Mask_label_' + str(num)
    value_columns.append(name)
    mask_columns.append(name2)
    complete_df[name] = 0.0
    complete_df[name2] = 0


  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  com

In [6]:
for index, row in complete_df.iterrows():
    name = 'Value_label_' + str(row['label_code'].astype(int))
    name2 = 'Mask_label_' + str(row['label_code'].astype(int))
    complete_df.at[index, name] = row['valuenum']
    complete_df.at[index, name2] = 1

# drop all unneccesary columns and do sanity check
complete_df.drop(['valuenum', 'label_code'], axis=1, inplace=True)

# If there are multiple values for the same time stamp, take the maximum
mimic4_df = complete_df.groupby(['ID', 'Time'], as_index=False).max()

for x in mask_columns:
    assert(len(mimic4_df.loc[mimic4_df[x] > 1]) == 0)
mimic4_df['ID'] = mimic4_df['ID'].astype(int)

mimic4_df.set_index(['ID'], inplace=True)
mimic4_df.dropna(inplace=False)
mimic4_df.to_csv(path_processed/'mimic4_full_dataset_next.csv')

  mimic4_df = complete_df.groupby(['ID', 'Time'], as_index=False).max()
  mimic4_df = complete_df.groupby(['ID', 'Time'], as_index=False).max()


In [4]:
random_state = 1
num_test_samples = 2000
num_large_m4 = 20000

path_r = path_processed/("r"+str(random_state))
if not os.path.exists(path_r):
    os.mkdir(path_r)

mimic4_df = pd.read_csv(path_processed/'mimic4_full_dataset_next.csv', index_col='ID')

# Create Next tvt datasets
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Next test dataset 
next_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_next_test_df = mimic4_df.loc[next_test_ids['ID']]
mimic4_next_test_df.to_csv(path_r/'data_mimic4_next_test.csv')

# Normal next train & validation dataset 
next_tv_ids = tvt_ids.drop(next_test_ids.index).sample(n=num_large_m4, random_state=random_state)
next_train_ids, next_valid_ids = train_test_split(next_tv_ids, test_size=0.2, random_state=random_state)
mimic4_next_train_df = mimic4_df.loc[next_train_ids['ID']]
mimic4_next_valid_df = mimic4_df.loc[next_valid_ids['ID']]
mimic4_next_train_df.to_csv(path_r/'m4_next_train.csv')
mimic4_next_valid_df.to_csv(path_r/'m4_next_valid.csv')

def generate_few_shot_datasets_next(num, ids, path_save):
    few_shot = ids.sample(n=num, random_state=random_state)['ID'].to_list()
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    few_shot_train_df=mimic4_df.loc[few_shot_train].reset_index()
    few_shot_valid_df=mimic4_df.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save/'m4_next_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save/'m4_next_{}_valid.csv'.format(num), index=False)
    
next_tv_ids = tvt_ids.drop(next_test_ids.index)
generate_few_shot_datasets_next(num=100, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=250, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=500, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=1000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=2000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=3000, ids=next_tv_ids, path_save=path_r)

In [None]:
random_state = 2
num_test_samples = 2000
num_large_m4 = 20000

path_r = path_processed/("r"+str(random_state))
if not os.path.exists(path_r):
    os.mkdir(path_r)

mimic4_df = pd.read_csv(path_processed/'mimic4_full_dataset_next.csv', index_col='ID')

# Create Next tvt datasets
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Next test dataset 
next_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_next_test_df = mimic4_df.loc[next_test_ids['ID']]
mimic4_next_test_df.to_csv(path_r/'data_mimic4_next_test.csv')

# Normal next train & validation dataset 
next_tv_ids = tvt_ids.drop(next_test_ids.index).sample(n=num_large_m4, random_state=random_state)
next_train_ids, next_valid_ids = train_test_split(next_tv_ids, test_size=0.2, random_state=random_state)
mimic4_next_train_df = mimic4_df.loc[next_train_ids['ID']]
mimic4_next_valid_df = mimic4_df.loc[next_valid_ids['ID']]
mimic4_next_train_df.to_csv(path_r/'m4_next_train.csv')
mimic4_next_valid_df.to_csv(path_r/'m4_next_valid.csv')

def generate_few_shot_datasets_next(num, ids, path_save):
    few_shot = ids.sample(n=num, random_state=random_state)['ID'].to_list()
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    few_shot_train_df=mimic4_df.loc[few_shot_train].reset_index()
    few_shot_valid_df=mimic4_df.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save/'m4_next_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save/'m4_next_{}_valid.csv'.format(num), index=False)
    
next_tv_ids = tvt_ids.drop(next_test_ids.index)
generate_few_shot_datasets_next(num=100, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=250, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=500, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=1000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=2000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=3000, ids=next_tv_ids, path_save=path_r)

In [None]:
random_state = 3
num_test_samples = 2000
num_large_m4 = 20000

path_r = path_processed/("r"+str(random_state))
if not os.path.exists(path_r):
    os.mkdir(path_r)

mimic4_df = pd.read_csv(path_processed/'mimic4_full_dataset_next.csv', index_col='ID')

# Create Next tvt datasets
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Next test dataset 
next_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_next_test_df = mimic4_df.loc[next_test_ids['ID']]
mimic4_next_test_df.to_csv(path_r/'data_mimic4_next_test.csv')

# Normal next train & validation dataset 
next_tv_ids = tvt_ids.drop(next_test_ids.index).sample(n=num_large_m4, random_state=random_state)
next_train_ids, next_valid_ids = train_test_split(next_tv_ids, test_size=0.2, random_state=random_state)
mimic4_next_train_df = mimic4_df.loc[next_train_ids['ID']]
mimic4_next_valid_df = mimic4_df.loc[next_valid_ids['ID']]
mimic4_next_train_df.to_csv(path_r/'m4_next_train.csv')
mimic4_next_valid_df.to_csv(path_r/'m4_next_valid.csv')

def generate_few_shot_datasets_next(num, ids, path_save):
    few_shot = ids.sample(n=num, random_state=random_state)['ID'].to_list()
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    few_shot_train_df=mimic4_df.loc[few_shot_train].reset_index()
    few_shot_valid_df=mimic4_df.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save/'m4_next_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save/'m4_next_{}_valid.csv'.format(num), index=False)
    
next_tv_ids = tvt_ids.drop(next_test_ids.index)
generate_few_shot_datasets_next(num=100, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=250, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=500, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=1000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=2000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=3000, ids=next_tv_ids, path_save=path_r)

In [None]:
random_state = 4
num_test_samples = 2000
num_large_m4 = 20000

path_r = path_processed/("r"+str(random_state))
if not os.path.exists(path_r):
    os.mkdir(path_r)

mimic4_df = pd.read_csv(path_processed/'mimic4_full_dataset_next.csv', index_col='ID')

# Create Next tvt datasets
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Next test dataset 
next_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_next_test_df = mimic4_df.loc[next_test_ids['ID']]
mimic4_next_test_df.to_csv(path_r/'data_mimic4_next_test.csv')

# Normal next train & validation dataset 
next_tv_ids = tvt_ids.drop(next_test_ids.index).sample(n=num_large_m4, random_state=random_state)
next_train_ids, next_valid_ids = train_test_split(next_tv_ids, test_size=0.2, random_state=random_state)
mimic4_next_train_df = mimic4_df.loc[next_train_ids['ID']]
mimic4_next_valid_df = mimic4_df.loc[next_valid_ids['ID']]
mimic4_next_train_df.to_csv(path_r/'m4_next_train.csv')
mimic4_next_valid_df.to_csv(path_r/'m4_next_valid.csv')

def generate_few_shot_datasets_next(num, ids, path_save):
    few_shot = ids.sample(n=num, random_state=random_state)['ID'].to_list()
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    few_shot_train_df=mimic4_df.loc[few_shot_train].reset_index()
    few_shot_valid_df=mimic4_df.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save/'m4_next_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save/'m4_next_{}_valid.csv'.format(num), index=False)
    
next_tv_ids = tvt_ids.drop(next_test_ids.index)
generate_few_shot_datasets_next(num=100, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=250, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=500, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=1000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=2000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=3000, ids=next_tv_ids, path_save=path_r)

In [None]:
random_state = 5
num_test_samples = 2000
num_large_m4 = 20000

path_r = path_processed/("r"+str(random_state))
if not os.path.exists(path_r):
    os.mkdir(path_r)

mimic4_df = pd.read_csv(path_processed/'mimic4_full_dataset_next.csv', index_col='ID')

# Create Next tvt datasets
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Next test dataset 
next_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_next_test_df = mimic4_df.loc[next_test_ids['ID']]
mimic4_next_test_df.to_csv(path_r/'data_mimic4_next_test.csv')

# Normal next train & validation dataset 
next_tv_ids = tvt_ids.drop(next_test_ids.index).sample(n=num_large_m4, random_state=random_state)
next_train_ids, next_valid_ids = train_test_split(next_tv_ids, test_size=0.2, random_state=random_state)
mimic4_next_train_df = mimic4_df.loc[next_train_ids['ID']]
mimic4_next_valid_df = mimic4_df.loc[next_valid_ids['ID']]
mimic4_next_train_df.to_csv(path_r/'m4_next_train.csv')
mimic4_next_valid_df.to_csv(path_r/'m4_next_valid.csv')

def generate_few_shot_datasets_next(num, ids, path_save):
    few_shot = ids.sample(n=num, random_state=random_state)['ID'].to_list()
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    few_shot_train_df=mimic4_df.loc[few_shot_train].reset_index()
    few_shot_valid_df=mimic4_df.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save/'m4_next_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save/'m4_next_{}_valid.csv'.format(num), index=False)
    
next_tv_ids = tvt_ids.drop(next_test_ids.index)
generate_few_shot_datasets_next(num=100, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=250, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=500, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=1000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=2000, ids=next_tv_ids, path_save=path_r)
generate_few_shot_datasets_next(num=3000, ids=next_tv_ids, path_save=path_r)