In [1]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pandas as pd
import os

In [3]:
# Root directory of the preprocessed HDF5 tables; it is also passed to
# split_train_test as the base folder for the train/val/test partitions.
# Set the THESIS_HDF_DIR environment variable to run the notebook on another
# machine without editing this cell; the original path remains the default.
hdf = os.environ.get('THESIS_HDF_DIR', '/home/mei/nas/docker/thesis/data/hdf/')

In [4]:
def create_folder(base_path, partition_name):
    """Ensure the directory ``base_path/partition_name`` exists and return its path.

    Args:
        base_path: Directory under which the partition folder is created.
        partition_name: Name of the sub-folder (for example ``'train'``).

    Returns:
        The joined folder path.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    folder_path = os.path.join(base_path, partition_name)
    # exist_ok=True makes creation idempotent without a separate existence
    # check, so nothing can create the folder between the check and the call.
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

In [5]:
def shuffle_stays(stays, seed=9):
    """Return ``stays`` in shuffled order, using ``seed`` as the random state.

    Args:
        stays: Collection of stay/patient identifiers to reorder.
        seed: Value forwarded to ``shuffle`` as ``random_state``.

    Returns:
        The shuffled collection produced by ``sklearn.utils.shuffle``.
    """
    shuffled = shuffle(stays, random_state=seed)
    return shuffled

In [6]:
def process_table(table_name, table, stays, folder_path):
    """Save the rows of ``table`` whose index value is in ``stays``.

    The selection is a membership filter on the index, so the written rows keep
    the order they have in ``table`` (the order of ``stays`` is not applied).

    Args:
        table_name: Stem of the output file; ``<table_name>.h5`` is written.
        table: Table whose index holds the identifiers to filter on.
        stays: Identifiers belonging to the partition.
        folder_path: Directory that receives the HDF5 file.
    """
    in_partition = table.index.isin(stays)
    out_file = os.path.join(folder_path, f'{table_name}.h5')
    # mode='w' overwrites any file left by a previous run.
    table.loc[in_partition].to_hdf(out_file, key='df', mode='w')

In [7]:
# Read each preprocessed table from the `hdf` directory and index it by the
# 'patient' column so partitions can be selected by index membership.
risks = (
    pd.read_hdf(os.path.join(hdf, 'final_risk_scores.h5'))
    .set_index('patient')
)
# reset_index() first turns the stored index into ordinary columns before
# 'patient' is set as the index (presumably 'patient' is part of the stored
# index in this file -- confirm against the writer).
timeseries = (
    pd.read_hdf(os.path.join(hdf, 'final_timeseries_Marker.h5'))
    .reset_index()
    .set_index('patient')
)
diagnoses = (
    pd.read_hdf(os.path.join(hdf, 'final_diagnoses.h5'))
    .set_index('patient')
)
flat_features = (
    pd.read_hdf(os.path.join(hdf, 'final_flat.h5'))
    .set_index('patient')
)
drug = (
    pd.read_hdf(os.path.join(hdf, 'final_drug.h5'))
    .set_index('patient')
)

In [8]:
def split_train_test(eICU_path, risks, timeseries, diagnoses, flat_features, drug, seed=9):
    """Split patients into train/val/test and save every table per partition.

    Patients are taken from the index of ``flat_features`` and split 70/15/15.
    For each partition a folder ``<eICU_path>/<partition>`` is created, the
    shuffled patient ids are written to ``stays.txt`` (one per line), and each
    table is filtered to the partition's patients and saved as
    ``<name>.h5``. A size report for every partition is printed at the end.

    Args:
        eICU_path: Base directory that receives the partition folders.
        risks: Table saved as ``risks.h5`` (reported as "Labels").
        timeseries: Table saved as ``timeseries_marker.h5``.
        diagnoses: Table saved as ``diagnoses.h5``.
        flat_features: Table saved as ``flat.h5``; its index defines the
            set of patients that is split.
        drug: Table saved as ``drug.h5``.
        seed: Random state used for both splits and for shuffling the ids.

    Returns:
        None. The results are the files written and the printed report.
    """
    # Split on distinct patient ids: a duplicated id in the index could
    # otherwise be drawn into more than one partition and leak data between
    # them. With a unique index this is identical to using the index directly.
    patients = flat_features.index.unique()
    train, test = train_test_split(patients, test_size=0.15, random_state=seed)
    # 0.15 / 0.85 of the remaining 85% equals 15% of all patients.
    train, val = train_test_split(train, test_size=0.15/0.85, random_state=seed)

    print('==> Loading data for splitting...')

    partitions = [('train', train), ('val', val), ('test', test)]
    # (output file stem, label used in the size report, table) -- defined once
    # so the save loop and the report cannot drift apart.
    tables = [
        ('risks', 'Labels', risks),
        ('flat', 'Flat Features', flat_features),
        ('diagnoses', 'Diagnoses', diagnoses),
        ('drug', 'Drug', drug),
        ('timeseries_marker', 'Time Series', timeseries),
    ]

    # Process and save partitions
    for partition_name, partition in partitions:
        print(f'==> Preparing {partition_name} data...')
        folder_path = create_folder(eICU_path, partition_name)
        stays = shuffle_stays(partition, seed=seed)

        with open(os.path.join(folder_path, 'stays.txt'), 'w') as f:
            f.writelines(f"{stay}\n" for stay in stays)
        for table_name, _, table in tables:
            process_table(table_name, table, stays, folder_path)
        print(f'==> {partition_name} data saved!\n')

    print("\n==== Dataset Sizes ====")
    for partition_name, partition in partitions:
        print(f"**{partition_name} set:**")
        for _, label, table in tables:
            # Count matching rows from the boolean mask instead of copying the
            # selected rows just to read their shape.
            n_rows = int(table.index.isin(partition).sum())
            print(f"- {label}: {(n_rows, table.shape[1])}")
        print()

    print('==> Splitting complete!')

In [9]:
# Run the split; the partition folders are written beneath the `hdf` directory.
split_train_test(
    eICU_path=hdf,
    risks=risks,
    timeseries=timeseries,
    diagnoses=diagnoses,
    flat_features=flat_features,
    drug=drug,
)

==> Loading data for splitting...
==> Preparing train data...
==> train data saved!

==> Preparing val data...
==> val data saved!

==> Preparing test data...
==> test data saved!


==== Dataset Sizes ====
**train set:**
- Labels: (12650433, 8)
- Flat Features: (8188, 4)
- Diagnoses: (8188, 124)
- Drug: (8188, 100)
- Time Series: (12650433, 325)

**val set:**
- Labels: (2724929, 8)
- Flat Features: (1755, 4)
- Diagnoses: (1755, 124)
- Drug: (1755, 100)
- Time Series: (2724929, 325)

**test set:**
- Labels: (2719342, 8)
- Flat Features: (1755, 4)
- Diagnoses: (1755, 124)
- Drug: (1755, 100)
- Time Series: (2719342, 325)

==> Splitting complete!


In [10]:
# Display the table read from 'final_risk_scores.h5', indexed by patient.
risks

Unnamed: 0_level_0,time,gender,age,dischargeweight,unitdischargestatus,actualiculos,discharge_risk_category,risk_score
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
252784,1,1,56.0,75.7,0,2.0500,1,0.258208
252784,2,1,56.0,75.7,0,2.0500,1,0.258122
252784,3,1,56.0,75.7,0,2.0500,1,0.258035
252784,4,1,56.0,75.7,0,2.0500,1,0.257949
252784,5,1,56.0,75.7,0,2.0500,1,0.257863
...,...,...,...,...,...,...,...,...
3348105,1500,0,86.0,60.3,0,2.8256,2,0.300400
3348105,1501,0,86.0,60.3,0,2.8256,2,0.300300
3348105,1502,0,86.0,60.3,0,2.8256,2,0.300200
3348105,1503,0,86.0,60.3,0,2.8256,2,0.300100


In [11]:
# Display the table read from 'final_timeseries_Marker.h5', indexed by patient.
timeseries

Unnamed: 0_level_0,time,-bands,-basos,-eos,-lymphs,-monos,-polys,24 h urine protein,24 h urine urea nitrogen,ALT (SGPT),...,sao2_marker,heartrate_marker,respiration_marker,cvp_marker,systemicsystolic_marker,systemicdiastolic_marker,systemicmean_marker,st1_marker,st2_marker,st3_marker
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
252784,1,0.5,0.066667,0.014286,0.135165,0.288095,0.5,0.5,0.5,0.004442,...,1,1,1,0,0,0,0,1,1,1
252784,2,0.5,0.066667,0.014286,0.135165,0.288095,0.5,0.5,0.5,0.004442,...,1,1,1,0,0,0,0,1,1,1
252784,3,0.5,0.066667,0.014286,0.135165,0.288095,0.5,0.5,0.5,0.004442,...,1,1,1,0,0,0,0,1,1,1
252784,4,0.5,0.066667,0.014286,0.135165,0.288095,0.5,0.5,0.5,0.004442,...,1,1,1,0,0,0,0,1,1,1
252784,5,0.5,0.066667,0.014286,0.135165,0.288095,0.5,0.5,0.5,0.004442,...,1,1,1,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3348105,1500,0.5,0.333333,0.095238,0.131868,0.309524,0.5,0.5,0.5,0.003273,...,0,0,0,0,0,0,0,0,0,0
3348105,1501,0.5,0.333333,0.095238,0.131868,0.309524,0.5,0.5,0.5,0.003273,...,0,0,0,0,0,0,0,0,0,0
3348105,1502,0.5,0.333333,0.095238,0.131868,0.309524,0.5,0.5,0.5,0.003273,...,0,0,0,0,0,0,0,0,0,0
3348105,1503,0.5,0.333333,0.095238,0.131868,0.309524,0.5,0.5,0.5,0.003273,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Display the table read from 'final_flat.h5', indexed by patient.
flat_features

Unnamed: 0_level_0,gender,age,admissionweight,> 89
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
252784,1,0.565789,0.226936,0
253331,1,0.828947,0.422792,0
255112,1,0.513158,0.106561,0
258354,0,0.631579,0.223976,0
259414,1,0.894737,0.387272,0
...,...,...,...,...
3247116,0,0.513158,0.161322,0
3247421,1,0.605263,0.232856,0
3346588,0,0.763158,0.456339,0
3347496,0,0.565789,0.208189,0


In [13]:
# Display the table read from 'final_diagnoses.h5', indexed by patient.
diagnoses

Unnamed: 0_level_0,Cardiovascular (R),Cardiovascular (R)|AICD,Cardiovascular (R)|Angina,Cardiovascular (R)|Arrhythmias,Cardiovascular (R)|Arrhythmias|atrial fibrillation - chronic,Cardiovascular (R)|Arrhythmias|atrial fibrillation - intermittent,Cardiovascular (R)|Congestive Heart Failure,Cardiovascular (R)|Congestive Heart Failure|CHF,Cardiovascular (R)|Congestive Heart Failure|CHF - severity unknown,Cardiovascular (R)|Coronary Artery Bypass,...,"apacheadmissiondx_Rhythm disturbance (atrial, supraventricular)",apacheadmissiondx_Rhythm disturbance (conduction defect),apacheadmissiondx_Seizures (primary-no structural brain disease),"apacheadmissiondx_Sepsis, GI","apacheadmissiondx_Sepsis, cutaneous/soft tissue","apacheadmissiondx_Sepsis, other","apacheadmissiondx_Sepsis, pulmonary","apacheadmissiondx_Sepsis, renal/UTI (including bladder)","apacheadmissiondx_Sepsis, unknown","grouped_apacheadmissiondx_Overdose,"
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
252784,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
253331,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
255112,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
258354,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
259414,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3247116,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3247421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3346588,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3347496,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Display the table read from 'final_drug.h5', indexed by patient.
drug

Unnamed: 0_level_0,ACETAMINOPHEN,ADVAIR DISKUS,ALBUTEROL,ALBUTEROL SULFATE,ALLOPURINOL,ALPRAZOLAM,AMBIEN,AMIODARONE HCL,AMLODIPINE BESILATE,ASPIR 81,...,TRAMADOL HCL,TRAZODONE HCL,TYLENOL,VITAMIN C,VITAMIN D,VITAMIN D3,WARFARIN SODIUM,XANAX,ZOCOR,ZOFRAN
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
252784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253331,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
255112,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
258354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
259414,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3247116,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3247421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3346588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3347496,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
