In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import train_test_split

In [13]:
model_data = pd.read_parquet('../data/model_data.parquet')

Time for the crucial part: splitting the data. Images from the same patient must end up in the same set, and the label distribution should be proportional across the sets. After some research I found a package that fits this job well: iterative-stratification, which lets us do the split without writing unnecessarily complicated functions.

In [14]:
# First of all, reduce the data to one row per patient. We assume that the state of a
# patient's lungs is constant over time, so the earliest image (lowest 'Follow-up #')
# can represent the patient. This may not always reflect reality, but it is an adequate
# assumption here and keeps the splitting logic simple.

# Keep the first *row* of every patient. `groupby(...).first()` is avoided on purpose:
# it returns the first non-null value of each column separately, so with missing values
# it could stitch together a "row" out of several different images of the same patient.
grouped_data = (model_data.sort_values(['Patient ID', 'Follow-up #'])
                .drop_duplicates(subset='Patient ID', keep='first')
                .reset_index(drop=True))

print(grouped_data.head())

   Patient ID          image_id  Follow-up #  Patient Age  Patient Gender  \
0           1  00000001_000.png            0           58               1   
1           2  00000002_000.png            0           81               1   
2           3  00000003_000.png            0           81               0   
3           4  00000004_000.png            0           82               1   
4           5  00000005_000.png            0           69               0   

   View Position  Atelectasis  Cardiomegaly  Effusion  Infiltration  ...  \
0              0            0             1         0             0  ...   
1              0            0             0         0             0  ...   
2              0            0             0         0             0  ...   
3              1            0             0         0             0  ...   
4              0            0             0         0             0  ...   

   Nodule  Pneumonia  Pneumothorax  Consolidation  Edema  Emphysema  Fibrosis  \

In [25]:

# Label columns used for stratification: 14 findings plus the 'No Finding' flag.
finding_vals = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
    'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
    'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia', 'No Finding',
]

# One entry per patient: the ID that gets assigned to a split, and its label matrix row.
x = grouped_data.loc[:, 'Patient ID']
y = grouped_data.loc[:, finding_vals].values

# Only the first of the 5 folds is used, so about one fifth of the patients is held out
# (to be divided into validation and test later) and the rest becomes the training set.
MSKF = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=99)
train_idx, test_val_idx = next(MSKF.split(x, y))

# The stratifier yields positional indices, hence .iloc.
train_patients, test_val_patients = x.iloc[train_idx], x.iloc[test_val_idx]




In [26]:
# Now divide the held-out patients (test_val_idx) into a validation set and a test set.
# The same stratifier is used so that label proportions stay comparable, and because we
# are still splitting patient IDs, all images of one patient stay in the same set.
held_out = grouped_data.iloc[test_val_idx]
x_test_val = held_out['Patient ID']
y_test_val = held_out[finding_vals].values

# Two folds: taking the first fold gives a half/half partition of the held-out patients.
MSKF_2 = MultilabelStratifiedKFold(n_splits=2, shuffle=True, random_state=99)
val_idx, test_idx = next(MSKF_2.split(x_test_val, y_test_val))

# These indices are positions within x_test_val (not within grouped_data).
val_patients, test_patients = x_test_val.iloc[val_idx], x_test_val.iloc[test_idx]




In [27]:
# Map the patient-level split back to image level: every image follows its patient.
train_df = model_data[model_data['Patient ID'].isin(train_patients)].copy()
val_df = model_data[model_data['Patient ID'].isin(val_patients)].copy()
test_df = model_data[model_data['Patient ID'].isin(test_patients)].copy()

# Sanity checks on the two requirements of the split: no patient may appear in more
# than one set, and every image must land in exactly one set.
_train_ids, _val_ids, _test_ids = set(train_patients), set(val_patients), set(test_patients)
assert _train_ids.isdisjoint(_val_ids), "patients shared between train and validation"
assert _train_ids.isdisjoint(_test_ids), "patients shared between train and test"
assert _val_ids.isdisjoint(_test_ids), "patients shared between validation and test"
assert len(train_df) + len(val_df) + len(test_df) == len(model_data), \
    "some images were not assigned to exactly one split"

In [34]:
# Let's see if the MSKF worked as expected
def show_split_distribution(df, labels=None):
    """Return the mean of each label column of ``df``.

    For 0/1 label columns the mean is the fraction of rows carrying that label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label columns.
    labels : iterable of str, optional
        Columns to summarise. Defaults to the notebook-level ``finding_vals``.

    Returns
    -------
    pandas.Series
        Column means, indexed by label name in the order given.
    """
    if labels is None:
        labels = finding_vals
    # list(...) so that a tuple is treated as several columns, not as one tuple key.
    return df[list(labels)].mean()


# Label prevalence of every split, keyed by split name.
ver_dict = {
    split_name: show_split_distribution(split_df)
    for split_name, split_df in (('Train', train_df),
                                 ('Validation', val_df),
                                 ('Test', test_df))
}

print(ver_dict)

# The label distribution is roughly proportional across all sets. Some categories differ
# a little, which is expected because they are rare.

{'Train': Atelectasis           0.104324
Cardiomegaly          0.024244
Effusion              0.118942
Infiltration          0.176752
Mass                  0.050462
Nodule                0.054580
Pneumonia             0.012688
Pneumothorax          0.045470
Consolidation         0.041510
Edema                 0.020699
Emphysema             0.022292
Fibrosis              0.015168
Pleural_Thickening    0.029932
Hernia                0.002053
No Finding            0.541902
dtype: float64, 'Validation': Atelectasis           0.099095
Cardiomegaly          0.029026
Effusion              0.120843
Infiltration          0.177033
Mass                  0.057460
Nodule                0.062114
Pneumonia             0.013201
Pneumothorax          0.058475
Consolidation         0.043751
Edema                 0.021494
Emphysema             0.026064
Fibrosis              0.014894
Pleural_Thickening    0.033850
Hernia                0.001185
No Finding            0.523314
dtype: float64, 'Test': Atelec

In [None]:
# Now let's remove the 'Patient ID' and 'Follow-up #' columns, since we don't need them anymore.
# We will use 'image_id' as a unique ID for each sample.
cols_to_remove = ['Patient ID', 'Follow-up #']

def remove_cols(df, cols, errors="ignore"):
    """Return a copy of ``df`` without the columns listed in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : list of str
        Column names to drop.
    errors : {"ignore", "raise"}, default "ignore"
        Passed to ``DataFrame.drop``. With "ignore", columns that are already
        absent are skipped, so re-running the cell that calls this function does
        not fail with a KeyError. Use "raise" to be strict about missing columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the requested columns removed.
    """
    return df.drop(columns=cols, errors=errors)

# Strip the bookkeeping columns from each of the three splits.
train_df, val_df, test_df = [
    remove_cols(split_df, cols_to_remove)
    for split_df in (train_df, val_df, test_df)
]



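Now it's time for normalization. We compute the normalization parameters on the training set only, after splitting the data, and then apply the same parameters to the validation and test sets. This prevents data leakage: we don't want any information from the val/test sets to reach the model during training.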

In [41]:

# Fit the min-max parameters on the training split only; they are reused for val/test.
train_age = train_df['Patient Age']
normalization_params = {"min_age": train_age.min(), "max_age": train_age.max()}

age_floor = normalization_params["min_age"]
age_span = normalization_params["max_age"] - age_floor

# Scale the training ages to [0, 1] and replace the raw column with the scaled one.
train_df = (
    train_df
    .assign(PatientAge_norm=(train_age - age_floor) / age_span)
    .drop(columns=['Patient Age'])
)

print(train_df.head())

           image_id  Patient Gender  View Position  Atelectasis  Cardiomegaly  \
0  00000001_000.png               1              0            0             1   
1  00000001_001.png               1              0            0             1   
2  00000001_002.png               1              0            0             1   
3  00000002_000.png               1              0            0             0   
4  00000003_000.png               0              0            0             0   

   Effusion  Infiltration  Mass  Nodule  Pneumonia  Pneumothorax  \
0         0             0     0       0          0             0   
1         0             0     0       0          0             0   
2         1             0     0       0          0             0   
3         0             0     0       0          0             0   
4         0             0     0       0          0             0   

   Consolidation  Edema  Emphysema  Fibrosis  Pleural_Thickening  Hernia  \
0              0      0     

In [42]:
def normalize_age(df, params):
    """Return a copy of ``df`` with 'Patient Age' replaced by 'PatientAge_norm'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Patient Age' column; it is not modified.
    params : dict
        Must provide "min_age" and "max_age", the min-max scaling parameters.

    Returns
    -------
    pandas.DataFrame
        New frame with 'PatientAge_norm' = (age - min_age) / (max_age - min_age)
        appended and the raw 'Patient Age' column removed.
    """
    age_span = params["max_age"] - params["min_age"]
    scaled_age = (df['Patient Age'] - params["min_age"]) / age_span
    return df.assign(PatientAge_norm=scaled_age).drop(columns=['Patient Age'])


# Apply the parameters fitted on the training set to the other two splits. Because the
# parameters come from the training data, an age outside the training range maps to a
# value outside [0, 1].
val_df = normalize_age(val_df, normalization_params)
test_df = normalize_age(test_df, normalization_params)

Everything is ready; now we can export the datasets into separate files. <br>
In the next notebook we will work with the images.

In [44]:
# Write each split to its own parquet file for the next notebook.
for split_df, target_path in (
    (train_df, '../data/train_df.parquet'),
    (val_df, '../data/val_df.parquet'),
    (test_df, '../data/test_df.parquet'),
):
    split_df.to_parquet(target_path)