In [1]:
import numpy as np
import os
from tqdm.notebook import tqdm
from skimage.transform import resize
from keras.preprocessing.image import img_to_array, load_img

In [2]:
# Dataset layout: the X-ray PNGs and their masks live in sibling folders
# underneath a single root directory.
ROOT_DIR = '../datasets/lung_segmentation/'
IMG_DIR, MASK_DIR = (os.path.join(ROOT_DIR, sub) for sub in ('CXR_png', 'masks'))

# Target width and height (in pixels) that every image and mask is resized to.
img_w = img_h = 128

In [3]:
# There are fewer masks than images.
# Masks have two file naming conventions mixed together:
#   MCUCXR_####_#.png         (same name as the X-ray), and
#   CHNCXR_####_#_mask.png    (X-ray name plus a `_mask` suffix).

# os.listdir() returns entries in arbitrary order and also lists any stray
# non-image files (e.g. .DS_Store). Keep only the PNGs and sort them so the
# sample order -- and therefore the arrays built from it -- is reproducible.
img_filenames = sorted(f for f in os.listdir(IMG_DIR) if f.endswith('.png'))
mask_filenames = sorted(f for f in os.listdir(MASK_DIR) if f.endswith('.png'))

# Find the img_ids for xrays with an available mask:
# drop the extension first, then the optional `_mask` suffix.
mask_stripped = [fname.split('.png')[0] for fname in mask_filenames]
img_ids = [fname.split('_mask')[0] for fname in mask_stripped]

# Masks that follow the `_mask` suffix naming convention.
check = [i for i in mask_stripped if "mask" in i]

print('There are {0:d} images with {1:d} masks'.format(len(img_filenames), 
                                                        len(mask_filenames)))
print('{0:d} of the {1:d} masks have a filename suffix'.format(len(check), 
                                                                len(mask_filenames)))

There are 800 images with 704 masks
566 of the 704 masks have a filename suffix


In [4]:
# Build the model inputs (X: X-rays) and targets (Y: masks), one sample per
# mask file. Both arrays are float32 with shape (n_samples, img_h, img_w, 1).
n_samples = len(mask_filenames)
X = np.empty(shape=(n_samples, img_h, img_w, 1), dtype='f4')
Y = np.empty(shape=(n_samples, img_h, img_w, 1), dtype='f4')

# skimage's resize takes the output shape as (rows, cols, channels), i.e.
# height first. This must match the per-sample shape of X and Y above,
# otherwise the assignment below fails as soon as img_w != img_h.
target_shape = (img_h, img_w, 1)

for i, mask_filename in tqdm(enumerate(mask_filenames), total=n_samples):
    
    # Find the image's filename from the img_id
    # (mask name without the extension and without the optional `_mask` suffix).
    img_id = mask_filename.split('.png')[0].split('_mask')[0]
    img_filename = img_id + '.png'
    
    mask = load_img(os.path.join(MASK_DIR, mask_filename), color_mode='grayscale')
    img  = load_img(os.path.join(IMG_DIR, img_filename), color_mode='grayscale')
    
    mask  = img_to_array(mask)
    x_img = img_to_array(img)
    
    # preserve_range=True keeps the original intensity scale, so the division
    # by 255 below is what rescales the values.
    # NOTE(review): resize presumably interpolates, so the resized mask may no
    # longer be strictly two-valued -- confirm whether it should be thresholded.
    mask  = resize(mask, target_shape, mode='constant', preserve_range=True)
    x_img = resize(x_img, target_shape, mode='constant', preserve_range=True)
    
    X[i] = x_img / 255.0
    Y[i] = mask / 255.0

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=704.0), HTML(value='')))




In [5]:
# Report the in-memory size of both arrays, in units of 1e6 bytes.
x_megabytes = X.nbytes / 1e6
y_megabytes = Y.nbytes / 1e6
report = 'Memory usage \n X: {0:.1f} MB, Y: {1:.1f} MB'
print(report.format(x_megabytes, y_megabytes))

Memory usage 
 X: 46.1 MB, Y: 46.1 MB


In [6]:
# Persist the preprocessed arrays to disk as .npy files.
OUTPUT_DIR = '../mydatadir/lung_segmentation/'

# np.save does not create missing parent directories; make sure the target
# folder exists so the preprocessing work is not lost to a FileNotFoundError.
os.makedirs(OUTPUT_DIR, exist_ok=True)

np.save(os.path.join(OUTPUT_DIR, 'X_train.npy'), X)
np.save(os.path.join(OUTPUT_DIR, 'y_train.npy'), Y)