In [2]:
import pandas as pd
import os
from utils import *

In [5]:
def split_train_test(eICU_path, flat_features, diagnoses,
                     timeseries,timeseries_15,timeseries_30,timeseries_60,
                     risks     ,risks_15,     risks_30,     risks_60,
                     seed=9):
    """Split stays into train/val/test partitions and save every table per partition.

    The index of ``flat_features`` is split with ``train_test_split`` into a
    test part (``test_size=0.15``) and a remainder; the remainder is split again
    with ``test_size=0.15/0.85`` so the validation part is also about 15 % of
    all ids. For each partition the function:

    1. obtains an output folder from ``create_folder(eICU_path, <partition>)``,
    2. shuffles the partition ids with ``shuffle_stays`` and writes them to
       ``stays.txt`` in that folder (one id per line),
    3. calls ``process_table(name, table, stays, folder)`` for each of the ten
       tables passed in.

    Afterwards the shape of every table restricted to each partition is printed.

    NOTE(review): ``train_test_split``, ``create_folder``, ``shuffle_stays`` and
    ``process_table`` are not defined in this notebook; presumably they come
    from ``from utils import *`` -- confirm.

    Args:
        eICU_path: Base directory handed to ``create_folder``.
        flat_features: Table whose index supplies the ids to split.
        diagnoses: Diagnoses table (filtered via its index).
        timeseries, timeseries_15, timeseries_30, timeseries_60: Time-series
            tables (original and the three resampled variants).
        risks, risks_15, risks_30, risks_60: Matching risk/label tables.
        seed: Random state used for both splits and for ``shuffle_stays``.

    Returns:
        None.
    """
    # Two-stage split: carve off test first, then validation from what remains.
    train_val_ids, test_ids = train_test_split(flat_features.index, test_size=0.15, random_state=seed)
    train_ids, val_ids = train_test_split(train_val_ids, test_size=0.15/0.85, random_state=seed)
    partitions = {'train': train_ids, 'val': val_ids, 'test': test_ids}

    # Name -> table, in the order the tables are handed to process_table.
    tables = {
        'flat': flat_features,
        'diagnoses': diagnoses,
        'timeseries': timeseries,
        'timeseries_15': timeseries_15,
        'timeseries_30': timeseries_30,
        'timeseries_60': timeseries_60,
        'risks': risks,
        'risks_15': risks_15,
        'risks_30': risks_30,
        'risks_60': risks_60,
    }

    # (caption, table, trailing text) for the size report; the trailing newline
    # reproduces the blank line printed after each "Labels" entry.
    size_report = [
        ('Flat Features', flat_features, ''),
        ('Diagnoses', diagnoses, ''),
        ('Time Series', timeseries, ''),
        ('Labels', risks, '\n'),
        ('Resample 15min Time Series', timeseries_15, ''),
        ('Resample 15min Labels', risks_15, '\n'),
        ('Resample 30min Time Series', timeseries_30, ''),
        ('Resample 30min Labels', risks_30, '\n'),
        ('Resample 60min Time Series', timeseries_60, ''),
        ('Resample 60min Labels', risks_60, '\n'),
    ]

    print('==> Loading data for splitting...')

    # Persist the stay list and every table for each partition.
    for split_name, split_ids in partitions.items():
        print(f'==> Preparing {split_name} data...')
        out_dir = create_folder(eICU_path, split_name)
        ordered_stays = shuffle_stays(split_ids, seed=seed)

        with open(os.path.join(out_dir, 'stays.txt'), 'w') as handle:
            handle.writelines(f"{stay}\n" for stay in ordered_stays)

        for label, frame in tables.items():
            process_table(label, frame, ordered_stays, out_dir)
        print(f'==> {split_name} data saved!\n')

    # Report how many rows of each table fall into each partition.
    print("\n==== Dataset Sizes ====")
    for split_name, split_ids in partitions.items():
        print(f"**{split_name} set:**")
        for caption, frame, tail in size_report:
            subset = frame.loc[frame.index.isin(split_ids)]
            print(f"- {caption}: {subset.shape}{tail}")

    print('==> Splitting complete!')
    return

def save_to_h5py(data_path, partition, table_name):
    """Re-save one partition table as an HDF5 file with one dataset per patient.

    Reads ``<data_path>/<partition>/<table_name>.h5`` with ``pd.read_hdf``,
    groups the rows by ``'patient'`` and writes each group's ``.values`` as a
    dataset named ``str(patient_id)`` into
    ``<data_path>/<partition>/<table_name>_each_patient.h5`` (opened with mode
    ``'w'``, so an existing file is replaced).

    Args:
        data_path: Directory that contains the partition folders.
        partition: Partition folder name (e.g. ``'train'``).
        table_name: Table file stem, without the ``.h5`` extension.

    Returns:
        None. Prints a confirmation line when done.
    """
    import os
    import h5py
    import pandas as pd

    base_dir = os.path.join(data_path, partition)
    frame = pd.read_hdf(os.path.join(base_dir, f"{table_name}.h5"))

    target = os.path.join(base_dir, f"{table_name}_each_patient.h5")
    with h5py.File(target, 'w') as out:
        # The patient id survives only as the dataset name; the stored array
        # holds the group's column values.
        for pid, rows in frame.groupby('patient'):
            out.create_dataset(str(pid), data=rows.values)
    print(f"Saved h5py for {partition}/{table_name}")

In [3]:
# Root directory of the preprocessed HDF5 tables.
hdf = '/home/mei/nas/docker/thesis/data/hdf/'

# Tables that only need 'patient' promoted to the index.
flat_features, diagnoses, risks, risks_15, risks_30, risks_60 = (
    pd.read_hdf(os.path.join(hdf, file_name)).set_index('patient')
    for file_name in (
        'final_flat_drug.h5',
        'final_diagnoses_trim_level.h5',
        'final_risk_scores.h5',
        'final_risk_scores_15min.h5',
        'final_risk_scores_30min.h5',
        'final_risk_scores_60min.h5',
    )
)

# Time-series tables: the stored index is reset into columns first, then
# 'patient' becomes the index.
timeseries, timeseries_15, timeseries_30, timeseries_60 = (
    pd.read_hdf(os.path.join(hdf, file_name)).reset_index().set_index('patient')
    for file_name in (
        'final_timeseries.h5',
        'final_timeseries_15min.h5',
        'final_timeseries_30min.h5',
        'final_timeseries_60min.h5',
    )
)


In [6]:
# Keyword arguments guard against mixing up the ten positional tables.
split_train_test(
    eICU_path=hdf,
    flat_features=flat_features,
    diagnoses=diagnoses,
    timeseries=timeseries,
    timeseries_15=timeseries_15,
    timeseries_30=timeseries_30,
    timeseries_60=timeseries_60,
    risks=risks,
    risks_15=risks_15,
    risks_30=risks_30,
    risks_60=risks_60,
)

==> Loading data for splitting...
==> Preparing train data...
==> train data saved!

==> Preparing val data...
==> val data saved!

==> Preparing test data...
==> test data saved!


==== Dataset Sizes ====
**train set:**
- Flat Features: (2150, 104)
- Diagnoses: (17336, 3)
- Time Series: (3126284, 155)
- Labels: (3126284, 8)

- Resample 15min Time Series: (1044603, 155)
- Resample 15min Labels: (1044603, 8)

- Resample 30min Time Series: (524096, 155)
- Resample 30min Labels: (524096, 8)

- Resample 60min Time Series: (263825, 155)
- Resample 60min Labels: (263825, 8)

**val set:**
- Flat Features: (461, 104)
- Diagnoses: (3754, 3)
- Time Series: (692647, 155)
- Labels: (692647, 8)

- Resample 15min Time Series: (231414, 155)
- Resample 15min Labels: (231414, 8)

- Resample 30min Time Series: (116093, 155)
- Resample 30min Labels: (116093, 8)

- Resample 60min Time Series: (58432, 155)
- Resample 60min Labels: (58432, 8)

**test set:**
- Flat Features: (461, 104)
- Diagnoses: (3326, 3)

In [7]:
hdf = '/home/mei/nas/docker/thesis/data/hdf/'

# Time-series and risk tables at every sampling resolution
# ('' is the unresampled variant), time-series names first.
TABLES = [
    f'{family}{resolution}'
    for family in ('timeseries', 'risks')
    for resolution in ('', '_15', '_30', '_60')
]

# Write a per-patient HDF5 file for every (partition, table) pair.
for partition in ('train', 'val', 'test'):
    for tbl in TABLES:
        save_to_h5py(hdf, partition, tbl)

Saved h5py for train/timeseries
Saved h5py for train/timeseries_15
Saved h5py for train/timeseries_30
Saved h5py for train/timeseries_60
Saved h5py for train/risks
Saved h5py for train/risks_15
Saved h5py for train/risks_30
Saved h5py for train/risks_60
Saved h5py for val/timeseries
Saved h5py for val/timeseries_15
Saved h5py for val/timeseries_30
Saved h5py for val/timeseries_60
Saved h5py for val/risks
Saved h5py for val/risks_15
Saved h5py for val/risks_30
Saved h5py for val/risks_60
Saved h5py for test/timeseries
Saved h5py for test/timeseries_15
Saved h5py for test/timeseries_30
Saved h5py for test/timeseries_60
Saved h5py for test/risks
Saved h5py for test/risks_15
Saved h5py for test/risks_30
Saved h5py for test/risks_60


In [8]:
# Display the unresampled risk-score table loaded earlier (indexed by patient).
risks

Unnamed: 0_level_0,time,gender,age,dischargeweight,unitdischargestatus,discharge_risk_category,actualiculos,risk_score
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
260132,1,0,89.0,84.1,1,3,0.1819,0.989578
260132,2,0,89.0,84.1,1,3,0.1819,0.989812
260132,3,0,89.0,84.1,1,3,0.1819,0.990046
260132,4,0,89.0,84.1,1,3,0.1819,0.990280
260132,5,0,89.0,84.1,1,3,0.1819,0.990514
...,...,...,...,...,...,...,...,...
3247116,2328,0,52.0,68.0,1,3,8.1256,0.999041
3247116,2329,0,52.0,68.0,1,3,8.1256,0.999281
3247116,2330,0,52.0,68.0,1,3,8.1256,0.999520
3247116,2331,0,52.0,68.0,1,3,8.1256,0.999760


In [9]:
# Display the unresampled time-series table loaded earlier (indexed by patient).
timeseries

Unnamed: 0_level_0,time,-bands,-basos,-eos,-lymphs,-monos,-polys,24 h urine protein,24 h urine urea nitrogen,ALT (SGPT),...,sao2,heartrate,respiration,cvp,systemicsystolic,systemicdiastolic,systemicmean,st1,st2,st3
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260132,1,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,2,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,3,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,4,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
260132,5,0.5,0.1,0.027778,0.167059,0.121429,0.828358,0.5,0.5,0.50000,...,0.914286,0.289474,0.509434,0.600000,0.095808,0.063636,0.058394,0.5,0.015778,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3247116,2328,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,...,0.890286,0.602807,0.443774,0.854000,0.581317,0.396000,0.463358,0.5,0.500000,0.5
3247116,2329,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,...,0.890476,0.601974,0.444182,0.854167,0.581587,0.397348,0.464416,0.5,0.500000,0.5
3247116,2330,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,...,0.889441,0.601449,0.432322,0.854348,0.576412,0.394466,0.459854,0.5,0.500000,0.5
3247116,2331,0.5,0.0,0.000000,0.023529,0.142857,0.500000,0.5,0.5,0.01897,...,0.888312,0.600080,0.433962,0.852273,0.570495,0.391322,0.455209,0.5,0.500000,0.5


In [10]:
# Display the flat-features table whose index was used for the train/val/test split.
flat_features

Unnamed: 0_level_0,gender,age,admissionweight,larger_than_89,acetaminophen,advair_diskus,albuterol,albuterol_sulfate,allopurinol,alprazolam,...,tramadol_hcl,trazodone_hcl,tylenol,vancomycin,ventolin_hfa,vicodin,vitamin_c,vitamin_d,warfarin_sodium,zofran
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260132,0,89.0,59.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
261021,0,89.0,51.7,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
270853,1,46.0,134.4,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
282833,0,84.0,51.3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
306940,0,71.0,81.6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3246409,1,77.0,83.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3246443,1,82.0,76.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3246620,1,66.0,106.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3246731,1,59.0,96.3,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [11]:
# Display the diagnoses table loaded earlier (indexed by patient).
diagnoses

Unnamed: 0_level_0,diagnosisstring,first,second
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
260132,Endocrine (R)/Hypothyroidism,Endocrine (R),Hypothyroidism
261021,Cardiovascular (R)/Pacemaker,Cardiovascular (R),Pacemaker
261021,Pulmonary/Asthma,Pulmonary,Asthma
261021,Pulmonary/COPD,Pulmonary,COPD
261021,Pulmonary/Asthma,Pulmonary,Asthma
...,...,...,...
3247116,Pulmonary/Respiratory Failure,Pulmonary,Respiratory Failure
3247116,Cardiovascular (R)/Hypertension Requiring Tre...,Cardiovascular (R),Hypertension Requiring Treatment
3247116,Pulmonary/Home Oxygen,Pulmonary,Home Oxygen
3247116,Pulmonary/COPD,Pulmonary,COPD


In [12]:
# Same root directory as in the earlier cells (re-assigned here so the cell runs standalone).
hdf= '/home/mei/nas/docker/thesis/data/hdf/'

# Read back the 60-min risk table stored in the test partition folder
# (presumably written by process_table during the split -- confirm).
# NOTE: `risk_test` is rebound to an h5py.File handle in a later cell.
risk_test = pd.read_hdf(os.path.join(hdf, 'test/risks_60.h5'))
risk_test

Unnamed: 0_level_0,time,gender,age,dischargeweight,unitdischargestatus,discharge_risk_category,actualiculos,risk_score
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
320647,1,1,61.0,84.1,0,2,5.1069,0.409812
320647,2,1,61.0,84.1,0,2,5.1069,0.409735
320647,3,1,61.0,84.1,0,2,5.1069,0.409658
320647,4,1,61.0,84.1,0,2,5.1069,0.409580
320647,5,1,61.0,84.1,0,2,5.1069,0.409503
...,...,...,...,...,...,...,...,...
3244909,67,1,63.0,83.9,0,1,1.9756,0.200450
3244909,68,1,63.0,83.9,0,1,1.9756,0.200337
3244909,69,1,63.0,83.9,0,1,1.9756,0.200225
3244909,70,1,63.0,83.9,0,1,1.9756,0.200112


In [13]:
# Read back the flat-features table stored in the test partition folder and display it.
flat_test = pd.read_hdf(os.path.join(hdf, 'test/flat.h5'))
flat_test

Unnamed: 0_level_0,gender,age,admissionweight,larger_than_89,acetaminophen,advair_diskus,albuterol,albuterol_sulfate,allopurinol,alprazolam,...,tramadol_hcl,trazodone_hcl,tylenol,vancomycin,ventolin_hfa,vicodin,vitamin_c,vitamin_d,warfarin_sodium,zofran
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
320647,1,61.0,108.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
426638,1,51.0,108.8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
430027,0,89.0,75.2,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
445889,0,60.0,83.6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
446339,1,77.0,90.2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3237396,1,60.0,93.1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3239059,0,76.0,113.4,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3240660,0,56.0,70.2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3240896,0,69.0,100.8,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
# Read back the diagnoses table stored in the test partition folder and display it.
diagnoses_test = pd.read_hdf(os.path.join(hdf, 'test/diagnoses.h5'))
diagnoses_test

Unnamed: 0_level_0,diagnosisstring,first,second
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
320647,Cardiovascular (R)/Myocardial Infarction,Cardiovascular (R),Myocardial Infarction
320647,Cardiovascular (R)/Congestive Heart Failure,Cardiovascular (R),Congestive Heart Failure
320647,Cardiovascular (R)/Valve disease,Cardiovascular (R),Valve disease
426638,Cardiovascular (R)/Arrhythmias,Cardiovascular (R),Arrhythmias
426638,Cardiovascular (R)/Arrhythmias,Cardiovascular (R),Arrhythmias
...,...,...,...
3240896,Endocrine (R)/Insulin Dependent Diabetes,Endocrine (R),Insulin Dependent Diabetes
3240896,Endocrine (R)/Hypothyroidism,Endocrine (R),Hypothyroidism
3240896,Cardiovascular (R)/Hypertension Requiring Tre...,Cardiovascular (R),Hypertension Requiring Treatment
3244909,Endocrine (R)/Insulin Dependent Diabetes,Endocrine (R),Insulin Dependent Diabetes


In [17]:
import h5py
hdf = '/home/mei/nas/docker/thesis/data/hdf/'

# Open the per-patient risk files read-only, one handle per partition and
# resolution. The handles stay open for the cells that follow.
# NOTE: `risk_test` here replaces the DataFrame bound to that name in an earlier cell.
risk_train, risk_val, risk_test = (
    h5py.File(os.path.join(hdf, f'{split}/risks_each_patient.h5'), 'r')
    for split in ('train', 'val', 'test')
)

risk_train_15, risk_val_15, risk_test_15 = (
    h5py.File(os.path.join(hdf, f'{split}/risks_15_each_patient.h5'), 'r')
    for split in ('train', 'val', 'test')
)

risk_train_30, risk_val_30, risk_test_30 = (
    h5py.File(os.path.join(hdf, f'{split}/risks_30_each_patient.h5'), 'r')
    for split in ('train', 'val', 'test')
)

risk_train_60, risk_val_60, risk_test_60 = (
    h5py.File(os.path.join(hdf, f'{split}/risks_60_each_patient.h5'), 'r')
    for split in ('train', 'val', 'test')
)

In [23]:
import pandas as pd

def read_h5_file_to_df(h5_file, columns):
    """Concatenate every per-patient dataset of an HDF5 file into one DataFrame.

    Each key of ``h5_file`` is treated as a patient id; the dataset stored under
    it is read in full, wrapped in a DataFrame with the given ``columns`` and
    tagged with an integer ``patient`` column taken from the key.

    Args:
        h5_file: Mapping-like object (e.g. an open ``h5py.File``) whose keys are
            patient ids as strings and whose values support ``[:]`` slicing.
        columns: Column names for the stored arrays, in storage order.

    Returns:
        A DataFrame with ``columns`` plus ``"patient"``, re-indexed from 0.
        If the file contains no datasets, an empty DataFrame with those
        columns is returned.

    Raises:
        ValueError: If a key cannot be converted with ``int``.
    """
    frames = []
    for pid in h5_file.keys():
        frame = pd.DataFrame(h5_file[pid][:], columns=columns)
        frame["patient"] = int(pid)
        frames.append(frame)

    # pd.concat raises ValueError on an empty list; return an empty frame with
    # the expected schema so downstream code can still run.
    if not frames:
        return pd.DataFrame(columns=[*columns, "patient"])
    return pd.concat(frames, ignore_index=True)

def summarize_by_risk(df, name=""):
    """Print and return patient and record counts per discharge risk category.

    Args:
        df: DataFrame with ``"discharge_risk_category"`` and ``"patient"``
            columns.
        name: Label for the printed header; it is upper-cased.

    Returns:
        A DataFrame indexed by ``discharge_risk_category`` (sorted ascending)
        with two columns: ``patient_count`` (distinct patients) and
        ``record_count`` (non-null ``patient`` entries).
    """
    summary = (
        df.groupby("discharge_risk_category")
        .agg(
            patient_count=("patient", "nunique"),
            record_count=("patient", "count"),
        )
        .sort_index()
    )

    # A leading newline separates consecutive summaries. The previous "\ " was
    # an invalid escape sequence that printed a literal backslash.
    print(f"\nSummary for {name.upper()}:")
    print(summary)
    return summary

# Column names applied to the arrays read from the *_each_patient.h5 risk files.
# They match the columns of the risk tables displayed earlier in the notebook;
# the patient id is not among them because it is taken from the dataset name.
COLUMNS = ['time', 'gender', 'age', 'dischargeweight', 'unitdischargestatus',
           'discharge_risk_category', 'actualiculos', 'risk_score']

In [24]:
# Rebuild one DataFrame per partition from the unresampled per-patient risk files.
df_train = read_h5_file_to_df(risk_train, COLUMNS)
df_val = read_h5_file_to_df(risk_val, COLUMNS)
df_test = read_h5_file_to_df(risk_test, COLUMNS)

# Each call prints its summary; the last call's return value is also rendered
# as the cell output.
summarize_by_risk(df_train, "train")
summarize_by_risk(df_val, "val")
summarize_by_risk(df_test, "test")

\ Summary for TRAIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                166         81986
1.0                               1303       2227418
2.0                                356        552741
3.0                                325        264139
\ Summary for VAL:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 37         18841
1.0                                280        479214
2.0                                 87        138928
3.0                                 57         55664
\ Summary for TEST:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 27         12505
1.0                                294        477276
2.0                                 89        135537
3.0                                 51 

Unnamed: 0_level_0,patient_count,record_count
discharge_risk_category,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,27,12505
1.0,294,477276
2.0,89,135537
3.0,51,42182


In [25]:
# Same summary for the 15-min risk files; the last call's return value is the cell output.
train_15 = read_h5_file_to_df(risk_train_15, COLUMNS)
val_15 = read_h5_file_to_df(risk_val_15, COLUMNS)
test_15 = read_h5_file_to_df(risk_test_15, COLUMNS)
summarize_by_risk(train_15, "train 15min")
summarize_by_risk(val_15, "val 15min")
summarize_by_risk(test_15, "test 15min")

\ Summary for TRAIN 15MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                166         27523
1.0                               1303        743981
2.0                                356        184684
3.0                                325         88415
\ Summary for VAL 15MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 37          6324
1.0                                280        160052
2.0                                 87         46419
3.0                                 57         18619
\ Summary for TEST 15MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 27          4199
1.0                                294        159419
2.0                                 89         45285
3.0                  

Unnamed: 0_level_0,patient_count,record_count
discharge_risk_category,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,27,4199
1.0,294,159419
2.0,89,45285
3.0,51,14121


In [26]:
# Same summary for the 30-min risk files; the last call's return value is the cell output.
train_30= read_h5_file_to_df(risk_train_30, COLUMNS)
val_30 = read_h5_file_to_df(risk_val_30, COLUMNS)
test_30 = read_h5_file_to_df(risk_test_30, COLUMNS)
summarize_by_risk(train_30, "train 30min")
summarize_by_risk(val_30, "val 30min")
summarize_by_risk(test_30, "test 30min")


\ Summary for TRAIN 30MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                166         13902
1.0                               1303        373069
2.0                                356         92644
3.0                                325         44481
\ Summary for VAL 30MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 37          3194
1.0                                280         80252
2.0                                 87         23289
3.0                                 57          9358
\ Summary for TEST 30MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 27          2120
1.0                                294         79951
2.0                                 89         22714
3.0                  

Unnamed: 0_level_0,patient_count,record_count
discharge_risk_category,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,27,2120
1.0,294,79951
2.0,89,22714
3.0,51,7106


In [27]:
# Same summary for the 60-min risk files; the last call's return value is the cell output.
train_60 = read_h5_file_to_df(risk_train_60, COLUMNS)
val_60 = read_h5_file_to_df(risk_val_60, COLUMNS)
test_60 = read_h5_file_to_df(risk_test_60, COLUMNS)
summarize_by_risk(train_60, "train 60min")
summarize_by_risk(val_60, "val 60min")
summarize_by_risk(test_60, "test 60min")

\ Summary for TRAIN 60MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                166          7092
1.0                               1303        187598
2.0                                356         46634
3.0                                325         22501
\ Summary for VAL 60MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 37          1627
1.0                                280         40354
2.0                                 87         11722
3.0                                 57          4729
\ Summary for TEST 60MIN:
                         patient_count  record_count
discharge_risk_category                             
0.0                                 27          1080
1.0                                294         40209
2.0                                 89         11430
3.0                  

Unnamed: 0_level_0,patient_count,record_count
discharge_risk_category,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,27,1080
1.0,294,40209
2.0,89,11430
3.0,51,3596
