In [1]:
import math
import os
import re
import sys

import cv2
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/numpy — consider
# narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [3]:
# Metadata table of the train/validation pool (read with pandas' default CSV settings).
metadata_csv = 'train_val_metadata.csv'
data_to_split = pd.read_csv(metadata_csv)

In [4]:
# How many cross-validation folds are built further down.
number_of_folds = 5
# Collects one [training_set, val_set] pair per fold.
folds = list()

In [5]:
# finding the images for the patients
# The image index is a space-separated file whose first line holds the column names.
patient_images = pd.read_csv('txt/train_val_COVIDx_CT-3A.csv', sep=' ', header=0)

# Capture the metadata layout before the extra column is added to data_to_split.
metadata_columns = data_to_split.columns.values.tolist()
# Empty frame with that layout; it is filled with the patients that have images.
full_set = pd.DataFrame(columns=metadata_columns)

# Both frames get a placeholder 'images' column.
for frame in (data_to_split, full_set):
    frame['images'] = ' '

In [6]:
# matching images to specific patient IDs
# Only patients with at least one matching image are copied into full_set;
# they keep their row label from data_to_split.
for i, row in data_to_split.iterrows():
    # Escape the ID so that regex metacharacters inside it are matched literally.
    # The trailing character class requires a non-alphanumeric character right after
    # the ID, so 'CP_10' does not also pick up the files of 'CP_1068'.
    pattern = '{}[^A-Za-z0-9]'.format(re.escape(str(row['patient id'])))
    mask = patient_images['fname'].str.match(pat=pattern)
    images = np.array(patient_images[mask])
    if images.size == 0:
        # no image belongs to this patient -> leave the patient out
        continue
    full_set.loc[i] = row
    # store only the first column of the matched rows (presumably `fname` — confirm)
    full_set.at[i, 'images'] = images[:, 0]

In [7]:
# Renumber the rows 0..n-1; the labels inherited from data_to_split are discarded.
full_set = full_set.reset_index(drop=True)

In [8]:
# Total number of image entries over all patients in full_set
# (rows are addressed by label 0..n-1, so the index must have been reset).
image_count = sum(
    full_set['images'][patient_idx].size
    for patient_idx in range(len(full_set))
)

In [9]:
# creating folds for training the model
# Every patient ends up in exactly one validation fold; the training set of a fold
# is full_set without that fold's validation patients.
#
# The patients are shuffled once with a fixed seed and cut into consecutive slices.
# This replaces drawing one random patient at a time with hard-coded fold sizes,
# which was not reproducible and could loop forever (or drop patients) whenever the
# sizes did not add up to len(full_set).
fold_seed = 11  # same value as the KFold split further down
folds.clear()  # re-running this cell must not append a second set of folds
shuffled_patients = full_set.sample(frac=1, random_state=fold_seed)

# The last `extra_patients` folds hold one patient more, so the fold sizes always
# sum to len(full_set).
base_fold_size, extra_patients = divmod(len(full_set), number_of_folds)

already_sampled = shuffled_patients.iloc[0:0]
split_image_count = 0
fold_start = 0
for i in range(0, number_of_folds):
    print("Now at fold: " + str(i))
    fold_size = base_fold_size + (1 if i >= number_of_folds - extra_patients else 0)
    fold_stop = fold_start + fold_size

    val_set = shuffled_patients.iloc[fold_start:fold_stop]
    # dropping by row label relies on full_set having a unique index
    training_set = full_set.drop(val_set.index, inplace=False)

    # everything up to fold_stop has been used for validation so far
    already_sampled = shuffled_patients.iloc[:fold_stop]
    split_image_count = split_image_count + sum(images.size for images in val_set['images'])
    fold_start = fold_stop

    folds.append([training_set, val_set])
    print("Already sampled: " + str(len(already_sampled)))
    print("Images added to splits: " + str(split_image_count))

Now at fold: 0
Already sampled: 902
Images added to splits: 74872
Now at fold: 1
Already sampled: 1804
Images added to splits: 149524
Now at fold: 2
Already sampled: 2706
Images added to splits: 224397
Now at fold: 3
Already sampled: 3609
Images added to splits: 298676
Now at fold: 4
Already sampled: 4512
Images added to splits: 381596


In [10]:
# Sanity check: the folds together must account for every image counted in full_set
image_count == split_image_count

True

In [11]:
# -----------------------------------------
# writing folds to disk for later reference
# -----------------------------------------

In [11]:
# write the TRAINING data of every fold to its own train_<fold>.txt
# (two lines per patient: the finding, then the string form of the images entry)
# NOTE(review): numpy abbreviates long arrays with '...' when converted to text —
# confirm that no patient exceeds numpy's print threshold.
for i in range(0, len(folds)):
    # the with-statement closes the file, so no explicit close() is needed
    with open('train_' + str(i) + '.txt', 'w') as f:
        for row_index, patient in folds[i][0].iterrows():
            f.write('%s\n' % patient['finding'])
            f.write('%s\n' % patient['images'])

In [12]:
# write the VALIDATION data of every fold to its own validation_<fold>.txt
# (two lines per patient: the finding, then the string form of the images entry)
# NOTE(review): numpy abbreviates long arrays with '...' when converted to text —
# confirm that no patient exceeds numpy's print threshold.
for i in range(0, len(folds)):
    # the with-statement closes the file, so no explicit close() is needed
    with open('validation_' + str(i) + '.txt', 'w') as f:
        for row_index, patient in folds[i][1].iterrows():
            f.write('%s\n' % patient['finding'])
            f.write('%s\n' % patient['images'])

In [13]:
# creating the HDF5 file
# Opening with 'w' starts from an empty file. The resulting layout is
#   /folds/<1..number_of_folds>/{train,val}/{Normal,Pneumonia,COVID-19}
#   /folds/test/{Normal,Pneumonia,COVID-19}
class_names = ("Normal", "Pneumonia", "COVID-19")

with h5py.File('COVID-classification.hdf5', 'w') as h5f:

    # folds
    hdf5_fold = h5f.create_group("folds")

    # one group per fold (numbered from 1), each with train and val sub-groups
    for fold_number in range(1, number_of_folds + 1):
        fold_group = hdf5_fold.create_group(str(fold_number))
        for subset_name in ("train", "val"):
            subset_group = fold_group.create_group(subset_name)
            for class_name in class_names:
                subset_group.create_group(class_name)

    # the test set is shared by all folds
    test_group = hdf5_fold.create_group("test")
    for class_name in class_names:
        test_group.create_group(class_name)
    # leaving the with-block closes the file

In [None]:
# loading TRAINING and VALIDATION images into HDF5 file
# -----------------------------------------------------

In [14]:
src = './images/3A_images/'

# loading the TRAINING images into the HDF5 file (images are NOT resized)
# -----------------------------------------------------------------------
# Each image becomes one dataset, named after its file, inside
# /folds/<fold>/train/<class>.

for i in range(1, len(folds) + 1):
    with h5py.File('COVID-classification.hdf5','r+') as h5f:

        print('Loading data into fold: ' + str(i))

        # loop through the TRAINING images
        for row_index, patient in folds[i-1][0].iterrows():

            # every finding other than Normal/Pneumonia is stored under COVID-19
            if patient['finding'] in ('Normal', 'Pneumonia'):
                class_name = patient['finding']
            else:
                class_name = 'COVID-19'
            target = h5f.get("/folds/" + str(i) + "/train/" + class_name)

            # atleast_1d lets a single image be handled like a list of images;
            # converting a one-element array with str() would give "['name.png']",
            # which is neither a valid file name nor a sensible dataset name.
            for image_name in np.atleast_1d(patient['images']).tolist():
                image_path = os.path.join(src, str(image_name))
                img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
                if img is None:
                    # cv2.imread signals a missing or undecodable file by returning None
                    raise FileNotFoundError('Could not read image: ' + image_path)
                target.create_dataset(str(image_name), data=img)

Loading data into fold: 1
Loading data into fold: 2
Loading data into fold: 3
Loading data into fold: 4
Loading data into fold: 5


In [15]:
src = './images/3A_images/'

# loading the VALIDATION images into the HDF5 file (images are NOT resized)
# -------------------------------------------------------------------------
# Each image becomes one dataset, named after its file, inside
# /folds/<fold>/val/<class>.

for i in range(1, len(folds) + 1):
    with h5py.File('COVID-classification.hdf5','r+') as h5f:

        print('Loading data into fold: ' + str(i))

        # loop through the VALIDATION images
        for row_index, patient in folds[i-1][1].iterrows():

            # every finding other than Normal/Pneumonia is stored under COVID-19
            if patient['finding'] in ('Normal', 'Pneumonia'):
                class_name = patient['finding']
            else:
                class_name = 'COVID-19'
            target = h5f.get("/folds/" + str(i) + "/val/" + class_name)

            # atleast_1d lets a single image be handled like a list of images;
            # converting a one-element array with str() would give "['name.png']",
            # which is neither a valid file name nor a sensible dataset name.
            for image_name in np.atleast_1d(patient['images']).tolist():
                image_path = os.path.join(src, str(image_name))
                img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
                if img is None:
                    # cv2.imread signals a missing or undecodable file by returning None
                    raise FileNotFoundError('Could not read image: ' + image_path)
                target.create_dataset(str(image_name), data=img)

Loading data into fold: 1
Loading data into fold: 2
Loading data into fold: 3
Loading data into fold: 4
Loading data into fold: 5


In [None]:
# loading TEST images into HDF5 dataset
# -------------------------------------

In [16]:
# Metadata table of the held-out test set (read with pandas' default CSV settings).
test_metadata_csv = 'test_metadata.csv'
test_metadata = pd.read_csv(test_metadata_csv)

In [18]:
# finding the images for the patients in the TEST set
# --------------------------------------------------

# The image index is a space-separated file whose first line holds the column names.
test_patient_images = pd.read_csv('txt/test_COVIDx_CT-3A.csv', sep=' ', header=0)

# Capture the metadata layout before the extra column is added to test_metadata.
test_metadata_columns = test_metadata.columns.values.tolist()
# Empty frame with that layout; it is filled with the test patients that have images.
test_full_set = pd.DataFrame(columns=test_metadata_columns)

# Both frames get a placeholder 'images' column.
for frame in (test_metadata, test_full_set):
    frame['images'] = ' '

In [19]:
# matching images to specific patient IDs
# Only test patients with at least one matching image are copied into test_full_set;
# they keep their row label from test_metadata.
for i, row in test_metadata.iterrows():
    # Escape the ID so that regex metacharacters inside it are matched literally.
    # The trailing character class requires a non-alphanumeric character right after
    # the ID, so an ID that is a prefix of another one does not pick up its files.
    pattern = '{}[^A-Za-z0-9]'.format(re.escape(str(row['patient id'])))
    mask = test_patient_images['fname'].str.match(pat=pattern)
    images = np.array(test_patient_images[mask])
    if images.size == 0:
        # no image belongs to this patient -> leave the patient out
        continue
    test_full_set.loc[i] = row
    # store only the first column of the matched rows (presumably `fname` — confirm)
    test_full_set.at[i, 'images'] = images[:, 0]

In [29]:
src = './images/3A_images/'
# loading the TEST images into the HDF5 file (images are NOT resized)
# Each image becomes one dataset, named after its file, inside /folds/test/<class>.
with h5py.File('COVID-classification.hdf5','r+') as h5f:

    for row_index, patient in test_full_set.iterrows():

        # every finding other than Normal/Pneumonia is stored under COVID-19
        if patient['finding'] in ('Normal', 'Pneumonia'):
            class_name = patient['finding']
        else:
            class_name = 'COVID-19'
        target = h5f.get("/folds/test/" + class_name)

        # atleast_1d lets a single image be handled like a list of images;
        # converting a one-element array with str() would give "['name.png']",
        # which is neither a valid file name nor a sensible dataset name.
        for image_name in np.atleast_1d(patient['images']).tolist():
            image_path = os.path.join(src, str(image_name))
            img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
            if img is None:
                # cv2.imread signals a missing or undecodable file by returning None
                raise FileNotFoundError('Could not read image: ' + image_path)
            target.create_dataset(str(image_name), data=img)

In [42]:
# write the TEST data to test_set.txt
# (three lines per patient: patient id, finding, string form of the images entry)
# The with-statement closes the file, so no explicit close() is needed.
with open('test_set.txt', 'w') as f:
    for row_index, patient in test_full_set.iterrows():
        f.write('%s\n' % patient['patient id'])
        f.write('%s\n' % patient['finding'])
        f.write('%s\n' % patient['images'])

In [30]:
# READ FROM SAVED FILES

In [48]:
import h5py
filename = "COVID-classification.hdf5"

# Groups to inspect: the three classes of fold 2 (train and val) and of the test set.
inspected_groups = [
    '/folds/2/train/Normal',
    '/folds/2/train/Pneumonia',
    '/folds/2/train/COVID-19',
    '/folds/2/val/Normal',
    '/folds/2/val/Pneumonia',
    '/folds/2/val/COVID-19',
    '/folds/test/Normal',
    '/folds/test/Pneumonia',
    '/folds/test/COVID-19',
]

with h5py.File(filename, "r") as f:
    # Printing a group shows its path and how many members it holds.
    for group_path in inspected_groups:
        print(f.get(group_path))

<HDF5 group "/folds/2/train/Normal" (35482 members)>
<HDF5 group "/folds/2/train/Pneumonia" (29551 members)>
<HDF5 group "/folds/2/train/COVID-19" (241911 members)>
<HDF5 group "/folds/2/val/Normal" (10262 members)>
<HDF5 group "/folds/2/val/Pneumonia" (5427 members)>
<HDF5 group "/folds/2/val/COVID-19" (58963 members)>
<HDF5 group "/folds/test/Normal" (15968 members)>
<HDF5 group "/folds/test/Pneumonia" (7965 members)>
<HDF5 group "/folds/test/COVID-19" (7437 members)>


In [43]:
# NOTE(review): `h5f` is whatever handle an earlier `with h5py.File(...) as h5f:` cell
# left behind; this cell fails on a fresh kernel unless such a cell ran first.
# The with-block already closes the file on exit — confirm whether this is still needed.
h5f.close()

In [61]:
# Testing split
# -------------

In [55]:
# Relative frequency of each finding in the training part of every fold
for fold in folds:
    training_part = fold[0]
    print(training_part['finding'].value_counts(normalize=True))

finding
COVID-19     0.764266
Pneumonia    0.176731
Normal       0.059003
Name: proportion, dtype: float64
finding
COVID-19     0.765097
Pneumonia    0.174792
Normal       0.060111
Name: proportion, dtype: float64
finding
COVID-19     0.767036
Pneumonia    0.172853
Normal       0.060111
Name: proportion, dtype: float64
finding
COVID-19     0.768634
Pneumonia    0.172070
Normal       0.059296
Name: proportion, dtype: float64
finding
COVID-19     0.756996
Pneumonia    0.183430
Normal       0.059573
Name: proportion, dtype: float64


In [54]:
# Relative frequency of each finding in the validation part of every fold
for fold in folds:
    validation_part = fold[1]
    print(validation_part['finding'].value_counts(normalize=True))

finding
COVID-19     0.764967
Pneumonia    0.172949
Normal       0.062084
Name: proportion, dtype: float64
finding
COVID-19     0.761641
Pneumonia    0.180710
Normal       0.057650
Name: proportion, dtype: float64
finding
COVID-19     0.75388
Pneumonia    0.18847
Normal       0.05765
Name: proportion, dtype: float64
finding
COVID-19     0.747508
Pneumonia    0.191584
Normal       0.060908
Name: proportion, dtype: float64
finding
COVID-19     0.794020
Pneumonia    0.146179
Normal       0.059801
Name: proportion, dtype: float64


In [None]:
# Alternative split using KFold
# -----------------------------

In [133]:
from sklearn.model_selection import KFold

In [135]:
# Shuffled K-fold splitter; the fixed random_state makes the split repeatable.
kf = KFold(
    n_splits=number_of_folds,
    shuffle=True,
    random_state=11,
)

In [137]:
# Collects one [training rows, validation rows] pair per KFold split.
folds_kfolds = list()

In [139]:
# Store the training rows and the validation rows of full_set for every KFold split.
for train_index, val_index in kf.split(full_set):
    training_rows = full_set.iloc[train_index]
    validation_rows = full_set.iloc[val_index]
    folds_kfolds.append([training_rows, validation_rows])

In [141]:
# Report the size of both parts of every KFold split, and their sum.
for split in folds_kfolds:
    n_training = len(split[0])
    n_validation = len(split[1])
    print("Training set length: " + str(n_training))
    print("Validation set length: " + str(n_validation))
    print("Total dataset length: " + str(n_training + n_validation))
    print("\n")

Training set length: 4452
Validation set length: 1113
Total dataset length: 5565


Training set length: 4452
Validation set length: 1113
Total dataset length: 5565


Training set length: 4452
Validation set length: 1113
Total dataset length: 5565


Training set length: 4452
Validation set length: 1113
Total dataset length: 5565


Training set length: 4452
Validation set length: 1113
Total dataset length: 5565




In [143]:
# Relative frequency of each finding in the training part of every KFold split
for split in folds_kfolds:
    training_part = split[0]
    print(training_part['finding'].value_counts(normalize=True))

finding
COVID-19     0.772237
Pneumonia    0.140836
Normal       0.086927
Name: proportion, dtype: float64
finding
COVID-19     0.772013
Pneumonia    0.140162
Normal       0.087826
Name: proportion, dtype: float64
finding
COVID-19     0.766622
Pneumonia    0.148248
Normal       0.085130
Name: proportion, dtype: float64
finding
COVID-19     0.769766
Pneumonia    0.142183
Normal       0.088050
Name: proportion, dtype: float64
finding
COVID-19     0.770216
Pneumonia    0.141959
Normal       0.087826
Name: proportion, dtype: float64


In [145]:
# Relative frequency of each finding in the validation part of every KFold split
for split in folds_kfolds:
    validation_part = split[1]
    print(validation_part['finding'].value_counts(normalize=True))

finding
COVID-19     0.761905
Pneumonia    0.150045
Normal       0.088050
Name: proportion, dtype: float64
finding
COVID-19     0.762803
Pneumonia    0.152740
Normal       0.084456
Name: proportion, dtype: float64
finding
COVID-19     0.784367
Pneumonia    0.120395
Normal       0.095238
Name: proportion, dtype: float64
finding
COVID-19     0.771788
Pneumonia    0.144654
Normal       0.083558
Name: proportion, dtype: float64
finding
COVID-19     0.769991
Pneumonia    0.145553
Normal       0.084456
Name: proportion, dtype: float64


In [147]:
from pandas.testing import assert_frame_equal

In [149]:
# Compare, position by position, the validation patient ids of the first split
# with those of every split (the first comparison is the split against itself).
for split in folds_kfolds:
    first_val_ids = folds_kfolds[0][1]['patient id'].reset_index(drop=True)
    current_val_ids = split[1]['patient id'].reset_index(drop=True)
    print(first_val_ids == current_val_ids)

0       True
1       True
2       True
3       True
4       True
        ... 
1108    True
1109    True
1110    True
1111    True
1112    True
Name: patient id, Length: 1113, dtype: bool
0       False
1       False
2       False
3       False
4       False
        ...  
1108    False
1109    False
1110    False
1111    False
1112    False
Name: patient id, Length: 1113, dtype: bool
0       False
1       False
2       False
3       False
4       False
        ...  
1108    False
1109    False
1110    False
1111    False
1112    False
Name: patient id, Length: 1113, dtype: bool
0       False
1       False
2       False
3       False
4       False
        ...  
1108    False
1109    False
1110    False
1111    False
1112    False
Name: patient id, Length: 1113, dtype: bool
0       False
1       False
2       False
3       False
4       False
        ...  
1108    False
1109    False
1110    False
1111    False
1112    False
Name: patient id, Length: 1113, dtype: bool


In [155]:
# Display the training frame of the first fold built by the manual split.
folds[0][0]

Unnamed: 0,patient id,source,country,sex,age,finding,verified finding,slice selection,view,modality,images
0,CP_10,CNCB,China,,,Pneumonia,Yes,Automatic,Axial,CT,"[[CP_10_3153_0172.png, 1, 30, 93, 474, 419], [..."
1,CP_1068,CNCB,China,,,Pneumonia,Yes,Automatic,Axial,CT,"[[CP_1068_3107_0021.png, 1, 0, 113, 512, 388]]"
2,CP_1071,CNCB,China,,,Pneumonia,Yes,Expert,Axial,CT,"[[CP_1071_3114_0035.png, 1, 33, 117, 501, 451]..."
3,CP_1072,CNCB,China,,,Pneumonia,Yes,Expert,Axial,CT,"[[CP_1072_3115_0037.png, 1, 46, 73, 472, 385],..."
4,CP_1073,CNCB,China,,,Pneumonia,Yes,Expert,Axial,CT,"[[CP_1073_3116_0031.png, 1, 22, 71, 503, 389],..."
...,...,...,...,...,...,...,...,...,...,...,...
5557,COVIDCTMD-normal066,COVID-CT-MD,Iran,M,30.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal066-IM0001.png, 0, 0, 159, 5..."
5558,COVIDCTMD-normal069,COVID-CT-MD,Iran,F,26.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal069-IM0001.png, 0, 0, 186, 5..."
5559,COVIDCTMD-normal070,COVID-CT-MD,Iran,M,60.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal070-IM0001.png, 0, 0, 161, 5..."
5563,COVIDCTMD-normal074,COVID-CT-MD,Iran,M,40.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal074-IM0001.png, 0, 0, 126, 5..."


In [157]:
# Display the training frame of the first KFold split.
folds_kfolds[0][0]

Unnamed: 0,patient id,source,country,sex,age,finding,verified finding,slice selection,view,modality,images
0,CP_10,CNCB,China,,,Pneumonia,Yes,Automatic,Axial,CT,"[[CP_10_3153_0172.png, 1, 30, 93, 474, 419], [..."
2,CP_1071,CNCB,China,,,Pneumonia,Yes,Expert,Axial,CT,"[[CP_1071_3114_0035.png, 1, 33, 117, 501, 451]..."
3,CP_1072,CNCB,China,,,Pneumonia,Yes,Expert,Axial,CT,"[[CP_1072_3115_0037.png, 1, 46, 73, 472, 385],..."
4,CP_1073,CNCB,China,,,Pneumonia,Yes,Expert,Axial,CT,"[[CP_1073_3116_0031.png, 1, 22, 71, 503, 389],..."
6,CP_1076,CNCB,China,,,Pneumonia,Yes,Expert,Axial,CT,"[[CP_1076_3120_0045.png, 1, 57, 84, 468, 396],..."
...,...,...,...,...,...,...,...,...,...,...,...
5560,COVIDCTMD-normal071,COVID-CT-MD,Iran,F,43.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal071-IM0001.png, 0, 0, 97, 51..."
5561,COVIDCTMD-normal072,COVID-CT-MD,Iran,M,31.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal072-IM0001.png, 0, 0, 139, 5..."
5562,COVIDCTMD-normal073,COVID-CT-MD,Iran,M,39.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal073-IM0001.png, 0, 0, 111, 5..."
5563,COVIDCTMD-normal074,COVID-CT-MD,Iran,M,40.0,Normal,Yes,,Axial,CT,"[[COVIDCTMD-normal074-IM0001.png, 0, 0, 126, 5..."
