In [1]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pandas as pd
import os

In [2]:
# Directory the final_*.h5 tables are read from; also passed as the output root for the split.
hdf = '/home/mei/nas/docker/thesis/data/hdf/'

In [3]:
def create_folder(base_path, partition_name):
    """Ensure the directory ``base_path/partition_name`` exists and return its path.

    Args:
        base_path: Parent directory under which the folder is created.
        partition_name: Name (or relative sub-path) of the folder to create.

    Returns:
        The joined path ``os.path.join(base_path, partition_name)``.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    folder_path = os.path.join(base_path, partition_name)
    # exist_ok=True replaces the separate exists() check: no window between
    # checking and creating, and an existing directory is simply reused.
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

In [9]:
def shuffle_stays(stays, seed=9):
    """Shuffle ``stays`` reproducibly.

    Delegates to ``sklearn.utils.shuffle`` with ``random_state=seed`` and
    returns whatever that call returns.
    """
    shuffled_stays = shuffle(stays, random_state=seed)
    return shuffled_stays

In [5]:
def process_table(table_name, table, stays, folder_path):
    """Save the rows of ``table`` whose index value is in ``stays``.

    The selected rows are written to ``<folder_path>/<table_name>.h5`` under
    the HDF key ``'df'``; ``mode='w'`` is passed to ``to_hdf``.
    """
    output_file = os.path.join(folder_path, f'{table_name}.h5')
    in_partition = table.index.isin(stays)
    selected_rows = table.loc[in_partition]
    selected_rows.to_hdf(output_file, key='df', mode='w')

In [6]:
# Read the four final_*.h5 tables from `hdf` and index each one by its `patient` column.
risks = pd.read_hdf(os.path.join(hdf, 'final_risk_scores.h5'))
risks = risks.set_index('patient')

# The timeseries table's existing index is moved back into columns before re-indexing.
timeseries = pd.read_hdf(os.path.join(hdf, 'final_timeseries.h5'))
timeseries = timeseries.reset_index().set_index('patient')

diagnoses = pd.read_hdf(os.path.join(hdf, 'final_diagnoses.h5'))
diagnoses = diagnoses.set_index('patient')

flat_features = pd.read_hdf(os.path.join(hdf, 'final_flat.h5'))
flat_features = flat_features.set_index('patient')

In [12]:
def _partition_shape(table, stays):
    """Shape ``table`` would have once restricted to rows whose index is in ``stays``.

    Counts the matching rows from the boolean mask instead of building the
    filtered copy, so large tables are not duplicated just to read a shape.
    """
    n_rows = int(table.index.isin(stays).sum())
    return (n_rows,) + tuple(table.shape[1:])


def split_train_test(eICU_path, risks, timeseries, diagnoses, flat_features, seed=9):
    """Split the index values of ``flat_features`` into train/val/test and save each partition.

    For each partition a folder ``<eICU_path>/<partition>`` is created that holds
    ``stays.txt`` (one id per line, in shuffled order) and one HDF file per table
    (``risks``, ``flat``, ``diagnoses``, ``timeseries``) restricted to the ids of
    that partition. A summary of the resulting table shapes is printed at the end.

    Args:
        eICU_path: Root directory under which the partition folders are created.
        risks, timeseries, diagnoses, flat_features: Tables indexed by the same
            ids; each is filtered with ``index.isin`` per partition.
        seed: Random state used for both splits and for shuffling the ids.

    Returns:
        None.
    """
    # Split on the distinct ids: if an id occurred more than once in the index it
    # could otherwise be drawn into several partitions and leak between them.
    # With an already-unique index this leaves the split unchanged.
    stay_ids = flat_features.index.unique()

    # 15% is held out for test; 0.15/0.85 of the remainder is again 15% of the
    # whole, which gives roughly a 70/15/15 train/val/test split.
    train, test = train_test_split(stay_ids, test_size=0.15, random_state=seed)
    train, val = train_test_split(train, test_size=0.15/0.85, random_state=seed)

    partitions = {'train': train, 'val': val, 'test': test}
    tables = {'risks': risks, 'flat': flat_features, 'diagnoses': diagnoses, 'timeseries': timeseries}

    print('==> Loading data for splitting...')

    # Process and save partitions
    for partition_name, partition in partitions.items():
        print(f'==> Preparing {partition_name} data...')
        folder_path = create_folder(eICU_path, partition_name)
        stays = shuffle_stays(partition, seed=seed)

        with open(os.path.join(folder_path, 'stays.txt'), 'w') as f:
            f.writelines(f"{stay}\n" for stay in stays)
        for table_name, table in tables.items():
            process_table(table_name, table, stays, folder_path)
        print(f'==> {partition_name} data saved!\n')

    print("\n==== Dataset Sizes ====")
    for partition_name, partition in partitions.items():
        print(f"**{partition_name} set:**")
        print(f"- Labels: {_partition_shape(risks, partition)}")
        print(f"- Flat Features: {_partition_shape(flat_features, partition)}")
        print(f"- Diagnoses: {_partition_shape(diagnoses, partition)}")
        print(f"- Time Series: {_partition_shape(timeseries, partition)}\n")

    print('==> Splitting complete!')

In [13]:
split_train_test(hdf, risks, timeseries, diagnoses, flat_features)

==> Loading data for splitting...
==> Preparing train data...
==> train data saved!

==> Preparing val data...
==> val data saved!

==> Preparing test data...
==> test data saved!


==== Dataset Sizes ====
**train set:**
- Labels: (12649957, 3)
- Flat Features: (8188, 104)
- Diagnoses: (8188, 124)
- Time Series: (12649957, 163)

**val set:**
- Labels: (2724855, 3)
- Flat Features: (1755, 104)
- Diagnoses: (1755, 124)
- Time Series: (2724855, 163)

**test set:**
- Labels: (2719329, 3)
- Flat Features: (1755, 104)
- Diagnoses: (1755, 124)
- Time Series: (2719329, 163)

==> Splitting complete!
