In [1]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pandas as pd
import os
import argparse

In [2]:
# Directory that is passed to split_train_test as the location of the input CSV files.
csv = '/home/mei/nas/docker/thesis/data/csv/'

In [3]:
def create_folder(parent_path, folder):
    """Make sure ``folder`` exists inside ``parent_path`` and return its full path.

    Args:
        parent_path: Directory under which the folder should live.
        folder: Name of the folder to create.

    Returns:
        The joined path ``parent_path/folder``.
    """
    target_dir = os.path.join(parent_path, folder)
    # exist_ok=True makes repeated calls harmless when the folder is already there.
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

In [4]:
def shuffle_stays(stays, seed=9):
    """Return ``stays`` in a shuffled order.

    Delegates to the ``shuffle`` helper imported at the top of the notebook,
    passing ``seed`` as its ``random_state``.

    Args:
        stays: Collection of stay identifiers to reorder.
        seed: Value forwarded as ``random_state`` (default 9).

    Returns:
        Whatever ``shuffle`` returns for ``stays``.
    """
    shuffled = shuffle(stays, random_state=seed)
    return shuffled

In [5]:
def process_table(table_name, table, stays, folder_path):
    table = table.loc[stays].copy()
    file_path = os.path.join(folder_path, f'{table_name}.csv')
    table.to_csv(file_path)
    return table

In [6]:
def split_train_test(eICU_path, is_test=True, seed=9):
    """Split patients into train/val/test partitions and write each one to disk.

    Reads ``final_labels.csv``, ``final_timeseries.csv``, ``final_diagnoses.csv``
    and ``final_flat_drug.csv`` from ``eICU_path``, indexing each by its
    ``patient`` column. For every partition a folder is created under
    ``eICU_path`` containing ``stays.txt`` (one shuffled stay id per line) and
    one CSV per table. A row-count summary per partition is printed at the end.

    Args:
        eICU_path: Directory holding the input CSV files; partition folders
            are created inside it.
        is_test: When true, only the first 999999 rows of the time-series file
            are read.
        seed: Random state used for both splits and for shuffling the stays.

    Returns:
        None.
    """
    def _load(file_name, row_limit=None):
        # Every input table is keyed by its 'patient' column.
        return pd.read_csv(os.path.join(eICU_path, file_name), nrows=row_limit).set_index('patient')

    def _rows_in(frame, members):
        # Number of rows whose index value belongs to the given partition.
        return frame.loc[frame.index.isin(members)].shape[0]

    labels = _load('final_labels.csv')

    # Hold out 15% for test, then take 0.15/0.85 of the remainder for validation
    # so that validation is likewise about 15% of all patients.
    remainder, test = train_test_split(labels.index, test_size=0.15, random_state=seed)
    train, val = train_test_split(remainder, test_size=0.15/0.85, random_state=seed)
    partitions = {'train': train, 'val': val, 'test': test}

    print('==> Loading data for splitting...')
    timeseries = _load('final_timeseries.csv', row_limit=999999 if is_test else None)
    diagnoses = _load('final_diagnoses.csv')
    flat_features = _load('final_flat_drug.csv')

    # Output file name -> table, in the order the files are written.
    tables = {
        'labels': labels,
        'flat': flat_features,
        'diagnoses': diagnoses,
        'timeseries': timeseries,
    }

    for partition_name, partition in partitions.items():
        print(f'==> Preparing {partition_name} data...')
        folder_path = create_folder(eICU_path, partition_name)
        stays = shuffle_stays(partition, seed=seed)

        # Record the shuffled stay order, one id per line.
        with open(os.path.join(folder_path, 'stays.txt'), 'w') as f:
            f.writelines(f"{stay}\n" for stay in stays)

        for table_name, table in tables.items():
            process_table(table_name, table, stays, folder_path)

    print("\n==== Dataset Sizes ====")
    for partition_name, partition in partitions.items():
        print(f"**{partition_name} set:**")
        print(f"- Labels: {_rows_in(labels, partition)}")
        print(f"- Flat Features: {_rows_in(flat_features, partition)}")
        print(f"- Diagnoses: {_rows_in(diagnoses, partition)}")
        print(f"- Time Series: {_rows_in(timeseries, partition)}\n")

    print('==> Splitting complete!')

In [7]:
# Run the split with is_test=False so the time-series file is read without a row limit.
split_train_test(eICU_path=csv, is_test=False)

==> Loading data for splitting...
==> Preparing train data...
==> Preparing val data...
==> Preparing test data...

==== Dataset Sizes ====
**train set:**
- Labels: 7015
- Flat Features: 7015
- Diagnoses: 7015
- Time Series: 168360

**val set:**
- Labels: 1504
- Flat Features: 1504
- Diagnoses: 1504
- Time Series: 36096

**test set:**
- Labels: 1504
- Flat Features: 1504
- Diagnoses: 1504
- Time Series: 36096

==> Splitting complete!
