In [1]:
import os
import pathlib
import shutil
from datetime import timedelta

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory two levels above the notebook's current working directory.
path_proj = pathlib.Path.cwd().parents[1]

In [2]:
# Data locations, both resolved relative to `path_proj`.
path_mimic4 = path_proj.joinpath('data/mimic4')
path_processed = path_proj.joinpath('data/mimic4/processed')

In [3]:
# Full processed dataset, indexed by its 'ID' column (an ID may occur on several rows,
# which is why the unique IDs are extracted separately below).
mimic4_df = pd.read_csv(
    path_processed.joinpath('mimic4_full_dataset.csv'),
    index_col='ID',
)

# One row per distinct ID found in the dataset index.
all_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Raw admissions table (gzip-compressed CSV).
adm = pd.read_csv(
    path_mimic4.joinpath('raw/core/admissions.csv.gz'),
    compression='gzip',
)

In [4]:
# Mortality label
#
# Keep only the admissions whose hadm_id appears in the dataset index, and expose
# 'hospital_expire_flag' as the 'labels' column keyed by 'ID'.
tvt_mortality_labels = (
    adm.loc[adm['hadm_id'].isin(mimic4_df.index), ['hadm_id', 'hospital_expire_flag']]
    .rename(columns={'hadm_id': 'ID', 'hospital_expire_flag': 'labels'})
)

tvt_mortality_labels.reset_index(drop=True).to_csv(
    path_processed.joinpath('mortality_labels.csv'),
    index=False,
)

In [None]:
# Length label
#
# Length of stay in whole hours (rounded up) from 'admittime' to 'dischtime', written
# for every admission whose 'hospital_expire_flag' is not 1.
# NOTE(review): unlike the mortality labels, this is not restricted to the IDs present
# in `mimic4_df` -- confirm that is intended.

# Work on an explicit copy: assigning columns on a slice of `adm` raises
# SettingWithCopyWarning and is ambiguous about whether `adm` itself is modified.
hosp_stay = adm[['hadm_id', 'admittime', 'dischtime', 'hospital_expire_flag']].copy()

hosp_stay['admittime'] = pd.to_datetime(hosp_stay['admittime'], format='%Y-%m-%d %H:%M:%S')
hosp_stay['dischtime'] = pd.to_datetime(hosp_stay['dischtime'], format='%Y-%m-%d %H:%M:%S')

# Duration expressed in hours, then rounded up to the next whole hour (vectorised).
hosp_stay['stayed_hour'] = np.ceil(
    (hosp_stay['dischtime'] - hosp_stay['admittime']) / pd.Timedelta(hours=1)
)

hosp_stay_survived = hosp_stay[hosp_stay['hospital_expire_flag'] != 1]

tvt_los_labels = hosp_stay_survived.rename(columns={'hadm_id': 'ID'})[['ID', 'stayed_hour']]

tvt_los_labels.reset_index(drop=True).to_csv(path_processed/'length_labels.csv', index=False)

In [5]:
# Mortality splits for one seed; every file is written to <path_processed>/r<random_state>.
random_state = 1         # seed shared by every sample()/train_test_split() call for this split
num_test_samples = 2000  # unique IDs held out as the test set
num_large_m4 = 20000     # unique IDs in the full train+validation pool

# Recreate the output directory from scratch so no file from an earlier run survives.
path_r = path_processed/("r"+str(random_state))
if path_r.exists():
    shutil.rmtree(path_r)
path_r.mkdir()

# `mimic4_df` is the frame already loaded from 'mimic4_full_dataset.csv' earlier in the
# notebook; nothing modifies it, so it is reused here instead of parsing the CSV again.

# Create Mortality tvt datasets
# One row per unique ID; its default positional index is what `drop` relies on below.
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Mortality test dataset
mortality_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_mortality_test_df = mimic4_df.loc[mortality_test_ids['ID']]
mimic4_mortality_test_df.to_csv(path_r/'data_mimic4_mortality_test.csv')

# Normal mortality train & validation dataset
# The test IDs are removed before sampling, so the pool cannot overlap the test set.
mortality_tv_ids = tvt_ids.drop(mortality_test_ids.index).sample(n=num_large_m4, random_state=random_state)
# 80% train / 20% validation.
mortality_train_ids, mortality_valid_ids = train_test_split(mortality_tv_ids, test_size=0.2, random_state=random_state)
mimic4_mortality_train_df = mimic4_df.loc[mortality_train_ids['ID']]
mimic4_mortality_valid_df = mimic4_df.loc[mortality_valid_ids['ID']]
mimic4_mortality_train_df.to_csv(path_r/'m4_mortality_train.csv')
mimic4_mortality_valid_df.to_csv(path_r/'m4_mortality_valid.csv')

# Balanced small datasets
# Partition the train+validation pool by 'hospital_expire_flag' (== 1 vs. != 1).
death_ids = adm.loc[adm['hospital_expire_flag'] == 1, 'hadm_id']
survived_ids = adm.loc[adm['hospital_expire_flag'] != 1, 'hadm_id']
tv_ids_death = mortality_tv_ids[mortality_tv_ids['ID'].isin(death_ids)]
tv_ids_survived = mortality_tv_ids[mortality_tv_ids['ID'].isin(survived_ids)]

def generate_balanced_small_datasets_mortality(num, ids_positive, ids_negative, path_save):
    """Write a small train/validation subset with equal numbers of positive and negative IDs.

    ``num // 2`` IDs are drawn from each pool, the combined ID list is split 80/20,
    and the matching rows of ``mimic4_df`` are saved under ``path_save`` as
    ``m4_mortality_<num>_train.csv`` and ``m4_mortality_<num>_valid.csv``.

    The globals ``random_state`` (seed) and ``mimic4_df`` (row source) are read at
    call time. The 80/20 split is not stratified, so only the combined sample is
    guaranteed to be balanced, not each individual split.

    Args:
        num: Total number of IDs to draw (half from each pool).
        ids_positive: DataFrame with an 'ID' column holding the positive-class IDs.
        ids_negative: DataFrame with an 'ID' column holding the negative-class IDs.
        path_save: Directory (path object supporting ``/``) receiving the CSV files.
    """
    per_class = num // 2
    chosen_ids = []
    # Positive IDs first, then negative ones.
    for pool in (ids_positive, ids_negative):
        chosen_ids = chosen_ids + pool.sample(n=per_class, random_state=random_state)['ID'].to_list()

    train_ids, valid_ids = train_test_split(chosen_ids, test_size=0.2, random_state=random_state)

    # 'ID' is turned back into a regular column, so the index itself is not written.
    train_rows = mimic4_df.loc[train_ids].reset_index()
    valid_rows = mimic4_df.loc[valid_ids].reset_index()
    train_rows.to_csv(path_save/'m4_mortality_{}_train.csv'.format(num), index=False)
    valid_rows.to_csv(path_save/'m4_mortality_{}_valid.csv'.format(num), index=False)

# Balanced subsets of increasing size, all drawn from the current train+validation pool.
for subset_size in (100, 250, 500, 1000, 2000, 3000):
    generate_balanced_small_datasets_mortality(
        num=subset_size,
        ids_positive=tv_ids_death,
        ids_negative=tv_ids_survived,
        path_save=path_r,
    )

In [7]:
# Mortality splits for one seed; every file is written to <path_processed>/r<random_state>.
random_state = 2         # seed shared by every sample()/train_test_split() call for this split
num_test_samples = 2000  # unique IDs held out as the test set
num_large_m4 = 20000     # unique IDs in the full train+validation pool

# Recreate the output directory from scratch so no file from an earlier run survives.
path_r = path_processed/("r"+str(random_state))
if path_r.exists():
    shutil.rmtree(path_r)
path_r.mkdir()

# `mimic4_df` is the frame already loaded from 'mimic4_full_dataset.csv' earlier in the
# notebook; nothing modifies it, so it is reused here instead of parsing the CSV again.

# Create Mortality tvt datasets
# One row per unique ID; its default positional index is what `drop` relies on below.
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Mortality test dataset
mortality_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_mortality_test_df = mimic4_df.loc[mortality_test_ids['ID']]
mimic4_mortality_test_df.to_csv(path_r/'data_mimic4_mortality_test.csv')

# Normal mortality train & validation dataset
# The test IDs are removed before sampling, so the pool cannot overlap the test set.
mortality_tv_ids = tvt_ids.drop(mortality_test_ids.index).sample(n=num_large_m4, random_state=random_state)
# 80% train / 20% validation.
mortality_train_ids, mortality_valid_ids = train_test_split(mortality_tv_ids, test_size=0.2, random_state=random_state)
mimic4_mortality_train_df = mimic4_df.loc[mortality_train_ids['ID']]
mimic4_mortality_valid_df = mimic4_df.loc[mortality_valid_ids['ID']]
mimic4_mortality_train_df.to_csv(path_r/'m4_mortality_train.csv')
mimic4_mortality_valid_df.to_csv(path_r/'m4_mortality_valid.csv')

# Balanced small datasets
# Partition the train+validation pool by 'hospital_expire_flag' (== 1 vs. != 1).
death_ids = adm.loc[adm['hospital_expire_flag'] == 1, 'hadm_id']
survived_ids = adm.loc[adm['hospital_expire_flag'] != 1, 'hadm_id']
tv_ids_death = mortality_tv_ids[mortality_tv_ids['ID'].isin(death_ids)]
tv_ids_survived = mortality_tv_ids[mortality_tv_ids['ID'].isin(survived_ids)]

def generate_balanced_small_datasets_mortality(num, ids_positive, ids_negative, path_save):
    """Write a small train/validation subset with equal numbers of positive and negative IDs.

    ``num // 2`` IDs are drawn from each pool, the combined ID list is split 80/20,
    and the matching rows of ``mimic4_df`` are saved under ``path_save`` as
    ``m4_mortality_<num>_train.csv`` and ``m4_mortality_<num>_valid.csv``.

    The globals ``random_state`` (seed) and ``mimic4_df`` (row source) are read at
    call time. The 80/20 split is not stratified, so only the combined sample is
    guaranteed to be balanced, not each individual split.

    Args:
        num: Total number of IDs to draw (half from each pool).
        ids_positive: DataFrame with an 'ID' column holding the positive-class IDs.
        ids_negative: DataFrame with an 'ID' column holding the negative-class IDs.
        path_save: Directory (path object supporting ``/``) receiving the CSV files.
    """
    per_class = num // 2
    chosen_ids = []
    # Positive IDs first, then negative ones.
    for pool in (ids_positive, ids_negative):
        chosen_ids = chosen_ids + pool.sample(n=per_class, random_state=random_state)['ID'].to_list()

    train_ids, valid_ids = train_test_split(chosen_ids, test_size=0.2, random_state=random_state)

    # 'ID' is turned back into a regular column, so the index itself is not written.
    train_rows = mimic4_df.loc[train_ids].reset_index()
    valid_rows = mimic4_df.loc[valid_ids].reset_index()
    train_rows.to_csv(path_save/'m4_mortality_{}_train.csv'.format(num), index=False)
    valid_rows.to_csv(path_save/'m4_mortality_{}_valid.csv'.format(num), index=False)

# Balanced subsets of increasing size, all drawn from the current train+validation pool.
for subset_size in (100, 250, 500, 1000, 2000, 3000):
    generate_balanced_small_datasets_mortality(
        num=subset_size,
        ids_positive=tv_ids_death,
        ids_negative=tv_ids_survived,
        path_save=path_r,
    )

In [5]:
# Mortality splits for one seed; every file is written to <path_processed>/r<random_state>.
random_state = 3         # seed shared by every sample()/train_test_split() call for this split
num_test_samples = 2000  # unique IDs held out as the test set
num_large_m4 = 20000     # unique IDs in the full train+validation pool

# Recreate the output directory from scratch so no file from an earlier run survives.
path_r = path_processed/("r"+str(random_state))
if path_r.exists():
    shutil.rmtree(path_r)
path_r.mkdir()

# `mimic4_df` is the frame already loaded from 'mimic4_full_dataset.csv' earlier in the
# notebook; nothing modifies it, so it is reused here instead of parsing the CSV again.

# Create Mortality tvt datasets
# One row per unique ID; its default positional index is what `drop` relies on below.
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Mortality test dataset
mortality_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_mortality_test_df = mimic4_df.loc[mortality_test_ids['ID']]
mimic4_mortality_test_df.to_csv(path_r/'data_mimic4_mortality_test.csv')

# Normal mortality train & validation dataset
# The test IDs are removed before sampling, so the pool cannot overlap the test set.
mortality_tv_ids = tvt_ids.drop(mortality_test_ids.index).sample(n=num_large_m4, random_state=random_state)
# 80% train / 20% validation.
mortality_train_ids, mortality_valid_ids = train_test_split(mortality_tv_ids, test_size=0.2, random_state=random_state)
mimic4_mortality_train_df = mimic4_df.loc[mortality_train_ids['ID']]
mimic4_mortality_valid_df = mimic4_df.loc[mortality_valid_ids['ID']]
mimic4_mortality_train_df.to_csv(path_r/'m4_mortality_train.csv')
mimic4_mortality_valid_df.to_csv(path_r/'m4_mortality_valid.csv')

# Balanced small datasets
# Partition the train+validation pool by 'hospital_expire_flag' (== 1 vs. != 1).
death_ids = adm.loc[adm['hospital_expire_flag'] == 1, 'hadm_id']
survived_ids = adm.loc[adm['hospital_expire_flag'] != 1, 'hadm_id']
tv_ids_death = mortality_tv_ids[mortality_tv_ids['ID'].isin(death_ids)]
tv_ids_survived = mortality_tv_ids[mortality_tv_ids['ID'].isin(survived_ids)]

def generate_balanced_small_datasets_mortality(num, ids_positive, ids_negative, path_save):
    """Write a small train/validation subset with equal numbers of positive and negative IDs.

    ``num // 2`` IDs are drawn from each pool, the combined ID list is split 80/20,
    and the matching rows of ``mimic4_df`` are saved under ``path_save`` as
    ``m4_mortality_<num>_train.csv`` and ``m4_mortality_<num>_valid.csv``.

    The globals ``random_state`` (seed) and ``mimic4_df`` (row source) are read at
    call time. The 80/20 split is not stratified, so only the combined sample is
    guaranteed to be balanced, not each individual split.

    Args:
        num: Total number of IDs to draw (half from each pool).
        ids_positive: DataFrame with an 'ID' column holding the positive-class IDs.
        ids_negative: DataFrame with an 'ID' column holding the negative-class IDs.
        path_save: Directory (path object supporting ``/``) receiving the CSV files.
    """
    per_class = num // 2
    chosen_ids = []
    # Positive IDs first, then negative ones.
    for pool in (ids_positive, ids_negative):
        chosen_ids = chosen_ids + pool.sample(n=per_class, random_state=random_state)['ID'].to_list()

    train_ids, valid_ids = train_test_split(chosen_ids, test_size=0.2, random_state=random_state)

    # 'ID' is turned back into a regular column, so the index itself is not written.
    train_rows = mimic4_df.loc[train_ids].reset_index()
    valid_rows = mimic4_df.loc[valid_ids].reset_index()
    train_rows.to_csv(path_save/'m4_mortality_{}_train.csv'.format(num), index=False)
    valid_rows.to_csv(path_save/'m4_mortality_{}_valid.csv'.format(num), index=False)

# Balanced subsets of increasing size, all drawn from the current train+validation pool.
for subset_size in (100, 250, 500, 1000, 2000, 3000):
    generate_balanced_small_datasets_mortality(
        num=subset_size,
        ids_positive=tv_ids_death,
        ids_negative=tv_ids_survived,
        path_save=path_r,
    )

In [9]:
# Mortality splits for one seed; every file is written to <path_processed>/r<random_state>.
random_state = 4         # seed shared by every sample()/train_test_split() call for this split
num_test_samples = 2000  # unique IDs held out as the test set
num_large_m4 = 20000     # unique IDs in the full train+validation pool

# Recreate the output directory from scratch so no file from an earlier run survives.
path_r = path_processed/("r"+str(random_state))
if path_r.exists():
    shutil.rmtree(path_r)
path_r.mkdir()

# `mimic4_df` is the frame already loaded from 'mimic4_full_dataset.csv' earlier in the
# notebook; nothing modifies it, so it is reused here instead of parsing the CSV again.

# Create Mortality tvt datasets
# One row per unique ID; its default positional index is what `drop` relies on below.
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Mortality test dataset
mortality_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_mortality_test_df = mimic4_df.loc[mortality_test_ids['ID']]
mimic4_mortality_test_df.to_csv(path_r/'data_mimic4_mortality_test.csv')

# Normal mortality train & validation dataset
# The test IDs are removed before sampling, so the pool cannot overlap the test set.
mortality_tv_ids = tvt_ids.drop(mortality_test_ids.index).sample(n=num_large_m4, random_state=random_state)
# 80% train / 20% validation.
mortality_train_ids, mortality_valid_ids = train_test_split(mortality_tv_ids, test_size=0.2, random_state=random_state)
mimic4_mortality_train_df = mimic4_df.loc[mortality_train_ids['ID']]
mimic4_mortality_valid_df = mimic4_df.loc[mortality_valid_ids['ID']]
mimic4_mortality_train_df.to_csv(path_r/'m4_mortality_train.csv')
mimic4_mortality_valid_df.to_csv(path_r/'m4_mortality_valid.csv')

# Balanced small datasets
# Partition the train+validation pool by 'hospital_expire_flag' (== 1 vs. != 1).
death_ids = adm.loc[adm['hospital_expire_flag'] == 1, 'hadm_id']
survived_ids = adm.loc[adm['hospital_expire_flag'] != 1, 'hadm_id']
tv_ids_death = mortality_tv_ids[mortality_tv_ids['ID'].isin(death_ids)]
tv_ids_survived = mortality_tv_ids[mortality_tv_ids['ID'].isin(survived_ids)]

def generate_balanced_small_datasets_mortality(num, ids_positive, ids_negative, path_save):
    """Write a small train/validation subset with equal numbers of positive and negative IDs.

    ``num // 2`` IDs are drawn from each pool, the combined ID list is split 80/20,
    and the matching rows of ``mimic4_df`` are saved under ``path_save`` as
    ``m4_mortality_<num>_train.csv`` and ``m4_mortality_<num>_valid.csv``.

    The globals ``random_state`` (seed) and ``mimic4_df`` (row source) are read at
    call time. The 80/20 split is not stratified, so only the combined sample is
    guaranteed to be balanced, not each individual split.

    Args:
        num: Total number of IDs to draw (half from each pool).
        ids_positive: DataFrame with an 'ID' column holding the positive-class IDs.
        ids_negative: DataFrame with an 'ID' column holding the negative-class IDs.
        path_save: Directory (path object supporting ``/``) receiving the CSV files.
    """
    per_class = num // 2
    chosen_ids = []
    # Positive IDs first, then negative ones.
    for pool in (ids_positive, ids_negative):
        chosen_ids = chosen_ids + pool.sample(n=per_class, random_state=random_state)['ID'].to_list()

    train_ids, valid_ids = train_test_split(chosen_ids, test_size=0.2, random_state=random_state)

    # 'ID' is turned back into a regular column, so the index itself is not written.
    train_rows = mimic4_df.loc[train_ids].reset_index()
    valid_rows = mimic4_df.loc[valid_ids].reset_index()
    train_rows.to_csv(path_save/'m4_mortality_{}_train.csv'.format(num), index=False)
    valid_rows.to_csv(path_save/'m4_mortality_{}_valid.csv'.format(num), index=False)

# Balanced subsets of increasing size, all drawn from the current train+validation pool.
for subset_size in (100, 250, 500, 1000, 2000, 3000):
    generate_balanced_small_datasets_mortality(
        num=subset_size,
        ids_positive=tv_ids_death,
        ids_negative=tv_ids_survived,
        path_save=path_r,
    )

In [10]:
# Mortality splits for one seed; every file is written to <path_processed>/r<random_state>.
random_state = 5         # seed shared by every sample()/train_test_split() call for this split
num_test_samples = 2000  # unique IDs held out as the test set
num_large_m4 = 20000     # unique IDs in the full train+validation pool

# Recreate the output directory from scratch so no file from an earlier run survives.
path_r = path_processed/("r"+str(random_state))
if path_r.exists():
    shutil.rmtree(path_r)
path_r.mkdir()

# `mimic4_df` is the frame already loaded from 'mimic4_full_dataset.csv' earlier in the
# notebook; nothing modifies it, so it is reused here instead of parsing the CSV again.

# Create Mortality tvt datasets
# One row per unique ID; its default positional index is what `drop` relies on below.
tvt_ids = pd.DataFrame(mimic4_df.index.unique(), columns=['ID'])

# Mortality test dataset
mortality_test_ids = tvt_ids.sample(n=num_test_samples, random_state=random_state)
mimic4_mortality_test_df = mimic4_df.loc[mortality_test_ids['ID']]
mimic4_mortality_test_df.to_csv(path_r/'data_mimic4_mortality_test.csv')

# Normal mortality train & validation dataset
# The test IDs are removed before sampling, so the pool cannot overlap the test set.
mortality_tv_ids = tvt_ids.drop(mortality_test_ids.index).sample(n=num_large_m4, random_state=random_state)
# 80% train / 20% validation.
mortality_train_ids, mortality_valid_ids = train_test_split(mortality_tv_ids, test_size=0.2, random_state=random_state)
mimic4_mortality_train_df = mimic4_df.loc[mortality_train_ids['ID']]
mimic4_mortality_valid_df = mimic4_df.loc[mortality_valid_ids['ID']]
mimic4_mortality_train_df.to_csv(path_r/'m4_mortality_train.csv')
mimic4_mortality_valid_df.to_csv(path_r/'m4_mortality_valid.csv')

# Balanced small datasets
# Partition the train+validation pool by 'hospital_expire_flag' (== 1 vs. != 1).
death_ids = adm.loc[adm['hospital_expire_flag'] == 1, 'hadm_id']
survived_ids = adm.loc[adm['hospital_expire_flag'] != 1, 'hadm_id']
tv_ids_death = mortality_tv_ids[mortality_tv_ids['ID'].isin(death_ids)]
tv_ids_survived = mortality_tv_ids[mortality_tv_ids['ID'].isin(survived_ids)]

def generate_balanced_small_datasets_mortality(num, ids_positive, ids_negative, path_save):
    """Write a small train/validation subset with equal numbers of positive and negative IDs.

    ``num // 2`` IDs are drawn from each pool, the combined ID list is split 80/20,
    and the matching rows of ``mimic4_df`` are saved under ``path_save`` as
    ``m4_mortality_<num>_train.csv`` and ``m4_mortality_<num>_valid.csv``.

    The globals ``random_state`` (seed) and ``mimic4_df`` (row source) are read at
    call time. The 80/20 split is not stratified, so only the combined sample is
    guaranteed to be balanced, not each individual split.

    Args:
        num: Total number of IDs to draw (half from each pool).
        ids_positive: DataFrame with an 'ID' column holding the positive-class IDs.
        ids_negative: DataFrame with an 'ID' column holding the negative-class IDs.
        path_save: Directory (path object supporting ``/``) receiving the CSV files.
    """
    per_class = num // 2
    chosen_ids = []
    # Positive IDs first, then negative ones.
    for pool in (ids_positive, ids_negative):
        chosen_ids = chosen_ids + pool.sample(n=per_class, random_state=random_state)['ID'].to_list()

    train_ids, valid_ids = train_test_split(chosen_ids, test_size=0.2, random_state=random_state)

    # 'ID' is turned back into a regular column, so the index itself is not written.
    train_rows = mimic4_df.loc[train_ids].reset_index()
    valid_rows = mimic4_df.loc[valid_ids].reset_index()
    train_rows.to_csv(path_save/'m4_mortality_{}_train.csv'.format(num), index=False)
    valid_rows.to_csv(path_save/'m4_mortality_{}_valid.csv'.format(num), index=False)

# Balanced subsets of increasing size, all drawn from the current train+validation pool.
for subset_size in (100, 250, 500, 1000, 2000, 3000):
    generate_balanced_small_datasets_mortality(
        num=subset_size,
        ids_positive=tv_ids_death,
        ids_negative=tv_ids_survived,
        path_save=path_r,
    )