In [41]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import os
from keras.preprocessing import image                  
from sklearn.utils import shuffle

In [42]:
# Label vocabulary used for the one-hot columns. The order matters: it fixes
# the column order of every array later built from `dataset_df[diseases]`.
diseases = [
    'Cardiomegaly',
    'Emphysema',
    'Effusion',
    'Hernia',
    'Nodule',
    'Pneumothorax',
    'Atelectasis',
    'Pleural_Thickening',
    'Mass',
    'Edema',
    'Consolidation',
    'Infiltration',
    'Fibrosis',
    'Pneumonia',
    'No Finding',
]

# Per-image metadata table; later cells read its 'Finding Labels' and
# 'Image Index' columns.
dataset_df = pd.read_csv('./dataset_information/Data_Entry_2017.csv')

In [43]:
# One-hot encode the labels: add one 0/1 column per entry in `diseases`.
# A row gets 1 when the disease name occurs as a plain substring of that
# row's 'Finding Labels' text, 0 otherwise.
for disease in diseases:
    dataset_df[disease] = [
        int(disease in finding_text)
        for finding_text in dataset_df['Finding Labels']
    ]

In [44]:
# Multi-label target matrix: one row per image, one column per `diseases` entry.
image_labels = dataset_df[diseases].to_numpy()

# Lookup table from bare file name to the path found under ./images.
image_paths = dict(
    (os.path.basename(png_path), png_path)
    for png_path in glob(os.path.join('.', 'images', '*.png'))
)

print(f"Samples Found: {len(image_paths)}")

Samples Found: 112120


In [45]:
# Attach the on-disk path of each image to its metadata row.
# The lookup goes through dict.get, so an 'Image Index' value with no
# matching file yields None instead of raising.
dataset_df['Image Path'] = dataset_df['Image Index'].apply(
    lambda image_name: image_paths.get(image_name)
)

In [46]:
# Image paths in dataframe row order; positions line up with `labelB` below.
images_list = dataset_df['Image Path'].tolist()

# Binary target: 1 when the image carries at least one pathology, else 0.
# 'No Finding' is itself one of the one-hot columns in `diseases`, so it must
# be excluded from the sum. If it is included, every 'No Finding' row sums to
# 1 and is labelled positive, which makes the binary label meaningless.
pathology_columns = [name for name in diseases if name != 'No Finding']
labelB = (dataset_df[pathology_columns].sum(axis = 1) > 0).tolist()
labelB = np.array(labelB, dtype = int)

In [47]:
def read_image_to_tensor(path, shape):
    """Load one image file and return it as a scaled, batched array.

    Parameters
    ----------
    path : path of the image file, passed straight to Keras ``load_img``.
    shape : target size handed to ``load_img`` as ``target_size``.

    Returns
    -------
    numpy array: the ``img_to_array`` result divided by 255, with a new
    leading axis of length 1 so single images can be stacked into a batch.
    """
    # Read the file as a PIL image resized to `shape`.
    pil_image = image.load_img(path, target_size = shape)

    # PIL image -> 3D pixel array.
    pixel_values = image.img_to_array(pil_image)

    # Divide every pixel by 255 (maps 8-bit pixel values into [0, 1]).
    scaled_pixels = pixel_values / 255

    # Prepend the batch axis: 3D -> 4D.
    return scaled_pixels[np.newaxis, ...]

In [48]:
def image_to_array(paths, shape):
    """Load every image in ``paths`` and return the tensors as a list.

    Each path goes through ``read_image_to_tensor(path, shape)``; results are
    returned in the same order as ``paths``. A tqdm progress bar is shown
    while loading.
    """
    progress_bar = tqdm(paths, desc = "Progress", ncols = 100)
    return [read_image_to_tensor(image_path, shape) for image_path in progress_bar]

In [49]:
# Turn the 1-D label vector into a column, then cut it by row position:
# rows [0, 75000) -> train, [75000, 85000) -> validation, [85000, end) -> test.
train_labels, valid_labels, test_labels = np.split(
    labelB[ : , np.newaxis], [75000, 85000]
)

In [50]:
# Target size every image is resized to on load.
IMAGE_SHAPE = (64, 64)

# Load the three splits in order (train, validation, test), using the same
# row boundaries as the label splits: 75000 and 85000.
training_samples, validation_samples, test_samples = (
    image_to_array(images_list[split_rows], shape = IMAGE_SHAPE)
    for split_rows in (
        slice(None, 75000),
        slice(75000, 85000),
        slice(85000, None),
    )
)

Progress:   1%|▍                                                | 714/75000 [00:09<16:53, 73.31it/s]


KeyboardInterrupt: 