In [1]:
import os
import pathlib
from datetime import timedelta

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

p_project = str(pathlib.Path(os.getcwd()).parents[1])
random_state = 4

In [2]:
path_temp = p_project + '/data/mimic4'
processed_balanced = path_temp + '/processed_balanced_r4'

In [3]:
mimic4_df = pd.read_csv(processed_balanced + '/mimic4_full_dataset.csv', index_col='ID')

all_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

In [4]:
mimic4_df.head()

Unnamed: 0_level_0,Time,Value_label_0,Mask_label_0,Value_label_2,Mask_label_2,Value_label_5,Mask_label_5,Value_label_6,Mask_label_6,Value_label_8,...,Value_label_82,Mask_label_82,Value_label_4,Mask_label_4,Value_label_94,Mask_label_94,Value_label_95,Mask_label_95,Value_label_92,Mask_label_92
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20000147,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
20000147,578,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
20000147,599,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
20000147,693,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
20000147,720,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [4]:
# Pretraining dataset

In [5]:
pretraining_ids = all_ids.sample(n=40000, random_state=random_state)

mimic4_pretraining_df = mimic4_df.loc[pretraining_ids['ID'].unique()]

mimic4_pretraining_df.to_csv(processed_balanced + '/data_mimic4_pretraining_40k_r{}.csv'.format(str(random_state)))

In [6]:
tvt_ids = all_ids.drop(pretraining_ids.index)

tvt_ids.to_csv(processed_balanced + '/tvt_ids.csv')

In [7]:
# Mortality dataset

In [9]:
tvt_ids = pd.read_csv(processed_balanced + '/tvt_ids.csv', index_col=0)

adm = pd.read_csv(path_data + '/core/admissions.csv.gz', compression='gzip')

tvt_mortality_labels = adm[adm['hadm_id'].isin(tvt_ids['ID'])][['hadm_id', 'hospital_expire_flag']].rename(columns={'hadm_id': 'ID', 'hospital_expire_flag': 'labels'})

tvt_mortality_labels.reset_index(drop=True).to_csv(processed_balanced + '/mortality_labels.csv')

In [10]:
mortality_test_ids = tvt_ids.sample(n=1000, random_state=random_state)

mimic4_mortality_test_df = mimic4_df.loc[mortality_test_ids['ID']]

mimic4_mortality_test_df.to_csv(processed_balanced + '/data_mimic4_mortality_test.csv')

In [11]:
mortality_tv_ids = tvt_ids.drop(mortality_test_ids.index)

death_ids = adm[adm['hospital_expire_flag']==1]['hadm_id']

survived_ids = adm[adm['hospital_expire_flag']!=1]['hadm_id']

tv_ids_death = mortality_tv_ids[mortality_tv_ids['ID'].isin(death_ids)]

tv_ids_survived = mortality_tv_ids[mortality_tv_ids['ID'].isin(survived_ids)]

In [12]:
def generate_balanced_few_shot_datasets_mortality(num, ids_positive, ids_negative, path_save):
    few_shot_pos = ids_positive.sample(n=num//2, random_state=random_state)['ID'].to_list()
    few_shot_neg = ids_negative.sample(n=num//2, random_state=random_state)['ID'].to_list()
    few_shot = few_shot_pos + few_shot_neg
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    df_train = pd.DataFrame(few_shot_train, columns =['ID'])
    df_valid = pd.DataFrame(few_shot_valid, columns =['ID'])
    df_train.to_csv(path_save + '/m4_mortality_train_{}.csv'.format(num))
    df_valid.to_csv(path_save + '/m4_mortality_valid_{}.csv'.format(num))
    few_shot_train_df=mimic4_df.loc[few_shot_train].reset_index()
    few_shot_valid_df=mimic4_df.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save + '/m4_mortality_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save + '/m4_mortality_{}_valid.csv'.format(num), index=False)

In [13]:
generate_balanced_few_shot_datasets_mortality(num=100, ids_positive=tv_ids_death, ids_negative=tv_ids_survived, path_save=processed_balanced)
generate_balanced_few_shot_datasets_mortality(num=250, ids_positive=tv_ids_death, ids_negative=tv_ids_survived, path_save=processed_balanced)
generate_balanced_few_shot_datasets_mortality(num=500, ids_positive=tv_ids_death, ids_negative=tv_ids_survived, path_save=processed_balanced)
generate_balanced_few_shot_datasets_mortality(num=1000, ids_positive=tv_ids_death, ids_negative=tv_ids_survived, path_save=processed_balanced)
generate_balanced_few_shot_datasets_mortality(num=2000, ids_positive=tv_ids_death, ids_negative=tv_ids_survived, path_save=processed_balanced)
generate_balanced_few_shot_datasets_mortality(num=3000, ids_positive=tv_ids_death, ids_negative=tv_ids_survived, path_save=processed_balanced)

In [14]:
# Length dataset

In [15]:
tvt_ids = pd.read_csv(processed_balanced + '/tvt_ids.csv')[['ID']]

adm = pd.read_csv(path_data + '/core/admissions.csv.gz', compression='gzip')

hosp_stay = adm[['hadm_id', 'admittime', 'dischtime', 'hospital_expire_flag']]

hosp_stay['admittime']=pd.to_datetime(hosp_stay['admittime'], format='%Y-%m-%d %H:%M:%S')
hosp_stay['dischtime']=pd.to_datetime(hosp_stay['dischtime'], format='%Y-%m-%d %H:%M:%S')

hosp_stay['stayed_hour'] = ((hosp_stay['dischtime'] - hosp_stay['admittime']) / pd.Timedelta(hours=1)).apply(np.ceil)

hosp_stay_survived = hosp_stay[hosp_stay['hospital_expire_flag']!=1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hosp_stay['admittime']=pd.to_datetime(hosp_stay['admittime'], format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hosp_stay['dischtime']=pd.to_datetime(hosp_stay['dischtime'], format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hosp_stay['stayed_hour'] 

In [16]:
tvt_los_labels = hosp_stay_survived[hosp_stay_survived['hadm_id'].isin(tvt_ids['ID'])].rename(columns={'hadm_id': 'ID'})[['ID', 'stayed_hour']]

tvt_los_labels.reset_index(drop=True).to_csv(processed_balanced + '/length_labels.csv')

tvt_ids_survived = tvt_los_labels[['ID']]

In [17]:
length_test_ids = tvt_ids_survived.sample(n=1000, random_state=random_state)

mimic4_length_test_df = mimic4_df.loc[length_test_ids['ID']]

mimic4_length_test_df.to_csv(processed_balanced + '/data_mimic4_length_test.csv')

In [18]:
length_tv_ids = tvt_ids_survived.drop(length_test_ids.index)

In [19]:
def generate_few_shot_datasets_length(num, ids, path_save):
    few_shot = ids.sample(n=num, random_state=random_state)['ID'].to_list()
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    df_train = pd.DataFrame(few_shot_train, columns =['ID'])
    df_valid = pd.DataFrame(few_shot_valid, columns =['ID'])
    df_train.to_csv(path_save + '/m4_length_train_{}.csv'.format(num))
    df_valid.to_csv(path_save + '/m4_length_valid_{}.csv'.format(num))
    few_shot_train_df=mimic4_df.loc[few_shot_train].reset_index()
    few_shot_valid_df=mimic4_df.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save + '/m4_length_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save + '/m4_length_{}_valid.csv'.format(num), index=False)

In [20]:
generate_few_shot_datasets_length(num=100, ids=length_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_length(num=250, ids=length_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_length(num=500, ids=length_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_length(num=1000, ids=length_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_length(num=2000, ids=length_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_length(num=3000, ids=length_tv_ids, path_save=processed_balanced)

In [21]:
# Next3 dataset

In [4]:
tvt_ids = pd.read_csv(processed_balanced + '/tvt_ids.csv')[['ID']]

lab_df = pd.read_csv(path_temp + '/processed/tables/lab_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'valuenum', 'label']]
inputs_df = pd.read_csv(path_temp + '/processed/tables/inputs_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'amount', 'label']]
outputs_df = pd.read_csv(path_temp + '/processed/tables/outputs_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'value', 'label']]
presc_df = pd.read_csv(path_temp + '/processed/tables/prescriptions_processed.csv')[
    ['subject_id', 'hadm_id', 'charttime', 'dose_val_rx', 'drug']]

lab_df = lab_df[lab_df['hadm_id'].isin(tvt_ids['ID'])]
inputs_df = inputs_df[inputs_df['hadm_id'].isin(tvt_ids['ID'])]
outputs_df = outputs_df[outputs_df['hadm_id'].isin(tvt_ids['ID'])]
presc_df = presc_df[presc_df['hadm_id'].isin(tvt_ids['ID'])]

# Change the name of amount. Valuenum for every table
inputs_df['valuenum'] = inputs_df['amount']
inputs_df = inputs_df.drop(columns=['amount']).copy()

outputs_df['valuenum'] = outputs_df['value']
outputs_df = outputs_df.drop(columns=['value']).copy()

presc_df['valuenum'] = presc_df['dose_val_rx']
presc_df = presc_df.drop(columns=['dose_val_rx']).copy()
presc_df['label'] = presc_df['drug']
presc_df = presc_df.drop(columns=['drug']).copy()

# Tag to distinguish between lab and inputs events
inputs_df['Origin'] = 'Inputs'
lab_df['Origin'] = 'Lab'
outputs_df['Origin'] = 'Outputs'
presc_df['Origin'] = 'Prescriptions'

# merge both dfs.
merged_df1 = (inputs_df.append(lab_df)).reset_index()
merged_df2 = (merged_df1.append(outputs_df)).reset_index()
merged_df2.drop(columns='level_0', inplace=True)
merged_df = (merged_df2.append(presc_df)).reset_index()

# Check that all labels have different names.
assert(merged_df['label'].nunique() == (inputs_df['label'].nunique(
)+lab_df['label'].nunique()+outputs_df['label'].nunique()+presc_df['label'].nunique()))

# set the timestamp as the time delta between the first chart time for each admission
merged_df['charttime'] = pd.to_datetime(
    merged_df['charttime'], format='%Y-%m-%d %H:%M:%S')
ref_time = merged_df.groupby('hadm_id')['charttime'].min()
merged_df_1 = pd.merge(ref_time.to_frame(name='ref_time'),
                       merged_df, left_index=True, right_on='hadm_id')
merged_df_1['time_stamp'] = merged_df_1['charttime']-merged_df_1['ref_time']
assert(len(merged_df_1.loc[merged_df_1['time_stamp']
       < timedelta(hours=0)].index) == 0)

# Create a label code (int) for the labels.
label_dict = dict(zip(list(merged_df_1['label'].unique()), range(
    len(list(merged_df_1['label'].unique())))))
merged_df_1['label_code'] = merged_df_1['label'].map(label_dict)

  inputs_df = pd.read_csv(path_temp + '/processed/tables/inputs_processed.csv')[
  presc_df = pd.read_csv(path_temp + '/processed/tables/prescriptions_processed.csv')[
  merged_df1 = (inputs_df.append(lab_df)).reset_index()
  merged_df2 = (merged_df1.append(outputs_df)).reset_index()
  merged_df = (merged_df2.append(presc_df)).reset_index()


In [5]:
merged_df_short = merged_df_1[['hadm_id', 'valuenum', 'time_stamp', 'label_code', 'Origin']].rename(
    columns={'hadm_id': 'ID', 'time_stamp': 'Time'})

# Make sure that the selected admissions have observations after 24 hours
ids_selected = merged_df_short[merged_df_short['Time'] > timedelta(hours=24)]['ID'].unique()
# select only values within first 48 hours
merged_df_short = merged_df_short[merged_df_short['ID'].isin(ids_selected)].loc[(merged_df_short['Time'] < timedelta(hours=48))]

merged_df_short['Time'] = merged_df_short['Time'].dt.total_seconds().div(60).astype(int)
assert(len(merged_df_short.loc[merged_df_short['Time'] > 2880].index) == 0)

# drop columns that are not needed for final dataset
merged_df_short.drop(['Origin'], axis=1, inplace=True)
complete_df = merged_df_short

# create value- and mask- columns and fill with data
labels = complete_df['label_code'].unique()
value_columns = []
mask_columns = []
for num in labels:
    name = 'Value_label_' + str(num)
    name2 = 'Mask_label_' + str(num)
    value_columns.append(name)
    mask_columns.append(name2)
    complete_df[name] = 0.0
    complete_df[name2] = 0
    
complete_df.dropna(inplace=True)

for index, row in complete_df.iterrows():
    name = 'Value_label_' + str(row['label_code'].astype(int))
    name2 = 'Mask_label_' + str(row['label_code'].astype(int))
    complete_df.at[index, name] = row['valuenum']
    complete_df.at[index, name2] = 1

# drop all unneccesary columns and do sanity check
complete_df.drop(['valuenum', 'label_code'], axis=1, inplace=True)

# If there are multiple values for the same time stamp, take the maximum
complete_df_gb = complete_df.groupby(['ID', 'Time'], as_index=False).max()

for x in mask_columns:
    assert(len(complete_df_gb.loc[complete_df_gb[x] > 1]) == 0)
complete_df_gb['ID'] = complete_df_gb['ID'].astype(int)

  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  complete_df[name2] = 0
  complete_df[name] = 0.0
  com

In [6]:
complete_df_gb.set_index(['ID'], inplace=True)
tvt_ids = tvt_ids[tvt_ids['ID'].isin(complete_df_gb.index)]
next3_test_ids = tvt_ids.sample(n=1000, random_state=random_state)
mimic4_next3_test_df = complete_df_gb.loc[next3_test_ids['ID']]
mimic4_next3_test_df.to_csv(processed_balanced + '/data_mimic4_next3_test.csv')

In [7]:
def generate_few_shot_datasets_next3(num, ids, path_save):
    few_shot = ids.sample(n=num, random_state=random_state)['ID'].to_list()
    few_shot_train, few_shot_valid = train_test_split(few_shot, test_size=0.2, random_state=random_state)
    df_train = pd.DataFrame(few_shot_train, columns =['ID'])
    df_valid = pd.DataFrame(few_shot_valid, columns =['ID'])
    df_train.to_csv(path_save + '/m4_next3_train_{}.csv'.format(num))
    df_valid.to_csv(path_save + '/m4_next3_valid_{}.csv'.format(num))
    few_shot_train_df=complete_df_gb.loc[few_shot_train].reset_index()
    few_shot_valid_df=complete_df_gb.loc[few_shot_valid].reset_index()
    few_shot_train_df.to_csv(path_save + '/m4_next3_{}_train.csv'.format(num), index=False)
    few_shot_valid_df.to_csv(path_save + '/m4_next3_{}_valid.csv'.format(num), index=False)

In [8]:
next3_tv_ids = tvt_ids.drop(next3_test_ids.index)
generate_few_shot_datasets_next3(num=100, ids=next3_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_next3(num=250, ids=next3_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_next3(num=500, ids=next3_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_next3(num=1000, ids=next3_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_next3(num=2000, ids=next3_tv_ids, path_save=processed_balanced)
generate_few_shot_datasets_next3(num=3000, ids=next3_tv_ids, path_save=processed_balanced)