In [1]:
# Patient-level stratified split of the dataset into train / validation / test
# sets, with optional export of one csv per set.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path

# --- Configuration ---
df_path = 'PATH/TO/RAW/DATA/data.csv'
# Directory that receives df_train.csv / df_val.csv / df_test.csv.
# Leave as None to skip writing.
df_output_path = None
seed=42
# Fraction of patients that goes into the train set.
train_size = 0.7
# Fraction of the remaining (non-train) patients that goes into validation;
# the rest becomes the held out test set.
val_to_test_ratio = 0.5
print(f'Loading csv from {df_path} and will write csvs to {df_output_path}.')
print(f'Using random seed: {seed}.\n'
      f'Aiming to have {train_size*100:.1f}% of patients in the train set,\n'
      f'{(1-train_size)*val_to_test_ratio*100:.1f}% of patients in the validation set and\n'
      f'{(1-train_size)*(1-val_to_test_ratio)*100:.1f}% of patients in the held out test set.\n')

np.random.seed(seed)

# --- Load ---
df_path = Path(df_path)
df = pd.read_csv(df_path)
disease_cols = ['AMD', 'RVO', 'Gla', 'MH', 'DR', 'RD', 'RP', 'AO', 'DM']

# Order disease cols by occurrence, starting with the least frequent.
# NOTE(review): this ordering is computed before the AO rows are dropped below,
# so it reflects the unfiltered data - confirm that this is intended.
disease_cols = df[disease_cols].sum().sort_values().index.values
# Exclude AO (only 21 eyes) and ignore DM (almost perfect co-occurrence with DR).
df = df.loc[df.AO==0]
disease_cols = disease_cols[~np.isin(disease_cols, ['AO', 'DM'])]

### STRATIFIED SPLIT ON PATIENT-LEVEL
# stratify column:
# - if patient has any disease, we take the most frequent disease
# - if there is a tie, we take the disease that is rarest across the data
#   (idxmax returns the first maximal column and the columns are ordered
#   from rarest to most frequent)
# - if and only if there is no disease in the patient,
# -- we take healthy as that patient's stratify value
# The per-patient disease counts are computed once and reused for both the
# stratify label and the healthy mask.
patient_disease_counts = df.groupby('ID')[disease_cols].sum()
patient_stratify = patient_disease_counts.idxmax(axis=1)
healthy_mask = (patient_disease_counts.sum(axis=1) == 0)
patient_stratify[healthy_mask] = 'healthy'

# split
print(f'Including the following diseases: {disease_cols}.\nSplitting.\n')
ids = patient_stratify.index.values
stratify_col = patient_stratify.values
# First split: train vs. (validation + test), stratified on the patient label.
ids_train, ids_valtest, strat_train, strat_valtest = train_test_split(ids, stratify_col,
                                                                      stratify=stratify_col,
                                                                      train_size=train_size,
                                                                      random_state=seed)

# Second split: validation vs. test, stratified within the remaining patients.
ids_val, ids_test, strat_val, strat_test = train_test_split(ids_valtest, strat_valtest,
                                                            stratify=strat_valtest,
                                                            train_size=val_to_test_ratio,
                                                            random_state=seed)

print(f'{"Overall:":15}{ids.size} patients\n'\
      f'{"Train:":15}{ids_train.size:4} patients ({ids_train.size/ids.size*100:.2f}%)\n'
      f'{"Validation:":15}{ids_val.size:4} patients ({ids_val.size/ids.size*100:.2f}%)\n'
      f'{"Test:":15}{ids_test.size:4} patients ({ids_test.size/ids.size*100:.2f}%)\n')

# Map the patient-level split back onto the image-level rows.
df_train = df.loc[df.ID.isin(ids_train)]
df_val = df.loc[df.ID.isin(ids_val)]
df_test = df.loc[df.ID.isin(ids_test)]

print(f'{"Overall:":15}{df.shape[0]} images\n'\
      f'{"Train:":15}{df_train.shape[0]:4} images ({df_train.shape[0]/df.shape[0]*100:.2f}%)\n'
      f'{"Validation:":15}{df_val.shape[0]:4} images ({df_val.shape[0]/df.shape[0]*100:.2f}%)\n'
      f'{"Test:":15}{df_test.shape[0]:4} images ({df_test.shape[0]/df.shape[0]*100:.2f}%)\n')

if df_output_path is not None:
    df_output_path = Path(df_output_path)
    # to_csv does not create missing directories, so make sure the target exists.
    df_output_path.mkdir(parents=True, exist_ok=True)
    print('Writing dataframes for each set.\n')
    df_train.to_csv(df_output_path/'df_train.csv')
    df_val.to_csv(df_output_path/'df_val.csv')
    df_test.to_csv(df_output_path/'df_test.csv')

print('Distribution of diseases on patient level for train set:')
print(pd.Series(strat_train).value_counts()/strat_train.size*100)
print('Distribution of diseases on patient level for validation set:')
print(pd.Series(strat_val).value_counts()/strat_val.size*100)

Loading csv from PATH/TO/RAW/DATA/data.csv and will write csvs to None.
Using random seed: 42.
Aiming to have 70.0% of patients in the train set,
15.0% of patients in the validation set and
15.0% of patients in the held out test set.

Including the following diseases: ['MH' 'RP' 'AMD' 'RVO' 'RD' 'Gla' 'DR'].
Splitting.

Overall:       5376 patients
Train:         3763 patients (70.00%)
Validation:     806 patients (14.99%)
Test:           807 patients (15.01%)

Overall:       13026 images
Train:         9121 images (70.02%)
Validation:    1911 images (14.67%)
Test:          1994 images (15.31%)

Distribution of diseases on patient level for train set:
healthy    43.183630
Gla        17.539197
DR         12.676056
RVO         8.158384
RD          7.759766
AMD         5.314908
MH          3.321818
RP          2.046240
dtype: float64
Distribution of diseases on patient level for validation set:
healthy    43.176179
Gla        17.493797
DR         12.655087
RVO         8.064516
RD         

In [2]:
# Build a table comparing the patient-level class distribution of the full
# dataset with the train / validation / test splits.
split_labels = {
    'TOP Dataset': stratify_col,
    'Train Set': strat_train,
    'Validation Set': strat_val,
    'Test Set': strat_test,
}

# Rows follow the class order of the full dataset (most frequent first).
# A class that is absent from a split counts as 0 instead of NaN.
overall_counts = pd.Series(stratify_col).value_counts()
split_counts = (
    pd.DataFrame({name: pd.Series(labels).value_counts()
                  for name, labels in split_labels.items()})
    .reindex(overall_counts.index)
    .fillna(0)
    .astype(int)
)

df_comp_percent = split_counts/split_counts.sum(0)*100
comp_totals = split_counts.sum(0)


def tuple_to_str_func(a, b):
    """Format a count `a` and a percentage `b` as 'count (xx.x%)'."""
    return f'{str(int(a)):<4} ({b:.1f}%)'


def series_elementwise_func(a, b):
    """Format two equally ordered sequences of counts and percentages pairwise."""
    return [tuple_to_str_func(x, y) for x, y in zip(a, b)]


# Assemble the formatted cells column by column; this avoids passing a
# list-returning function to DataFrame.combine.
df_comp = pd.DataFrame(
    {col: series_elementwise_func(split_counts[col], df_comp_percent[col])
     for col in split_counts.columns},
    index=split_counts.index,
)

df_comp.loc['Total Patients'] = comp_totals

# From here on: one row per dataset/split, one column per class.
df_comp = df_comp.T
# Rename by label rather than overwriting all column names positionally, so the
# class names can never get out of sync with the counts.
df_comp = df_comp.rename(columns={'healthy': 'Healthy'})

df_comp.T

Unnamed: 0,TOP Dataset,Train Set,Validation Set,Test Set
Healthy,2322 (43.2%),1625 (43.2%),348 (43.2%),349 (43.2%)
Gla,943 (17.5%),660 (17.5%),141 (17.5%),142 (17.6%)
DR,682 (12.7%),477 (12.7%),102 (12.7%),103 (12.8%)
RVO,438 (8.1%),307 (8.2%),65 (8.1%),66 (8.2%)
RD,417 (7.8%),292 (7.8%),63 (7.8%),62 (7.7%)
AMD,285 (5.3%),200 (5.3%),43 (5.3%),42 (5.2%)
MH,179 (3.3%),125 (3.3%),27 (3.3%),27 (3.3%)
RP,110 (2.0%),77 (2.0%),17 (2.1%),16 (2.0%)
Total Patients,5376,3763,806,807


In [3]:
# Extend the comparison table with the image count and patient demographics of
# each dataset. The order of `dfs` has to match the row order of df_comp.
dfs = [df, df_train, df_val, df_test]

image_totals = []
age_summaries = []
female_summaries = []
for split_df in dfs:
    per_patient = split_df.groupby('ID')
    # One value per patient, so patients with many images are not over-weighted.
    patient_age = per_patient['age'].mean()
    patient_is_female = per_patient['sex'].max() == 'F'

    image_totals.append(split_df.shape[0])
    age_summaries.append(f"{patient_age.mean():.2f} ± {patient_age.std():.2f}")
    female_summaries.append(
        f"{int(patient_is_female.sum())} / {patient_is_female.mean()*100:.0f}%"
    )

df_comp['Total Images (N)'] = image_totals
df_comp['Patient Age (Mean±Std)'] = age_summaries
df_comp['Female Patients (N, %)'] = female_summaries

df_comp.T

Unnamed: 0,TOP Dataset,Train Set,Validation Set,Test Set
Healthy,2322 (43.2%),1625 (43.2%),348 (43.2%),349 (43.2%)
Gla,943 (17.5%),660 (17.5%),141 (17.5%),142 (17.6%)
DR,682 (12.7%),477 (12.7%),102 (12.7%),103 (12.8%)
RVO,438 (8.1%),307 (8.2%),65 (8.1%),66 (8.2%)
RD,417 (7.8%),292 (7.8%),63 (7.8%),62 (7.7%)
AMD,285 (5.3%),200 (5.3%),43 (5.3%),42 (5.2%)
MH,179 (3.3%),125 (3.3%),27 (3.3%),27 (3.3%)
RP,110 (2.0%),77 (2.0%),17 (2.1%),16 (2.0%)
Total Patients,5376,3763,806,807
Total Images (N),13026,9121,1911,1994
