In [1]:
import numpy as np
import os
import random
import shutil

from PIL import Image

random.seed(0)

# Data Preparation

### Set up data folders

In [2]:
!rm -rf data/val_images data/val_masks data/train_images data/train_masks
!mkdir data/val_images data/val_masks data/train_images data/train_masks

In [3]:
BASE_DIR = 'data/'
SRC_DIR = os.path.join(BASE_DIR, 'kaggle_3m')

TRAIN_IMAGES_DIR = os.path.join(BASE_DIR, 'train_images')
TRAIN_MASKS_DIR = os.path.join(BASE_DIR, 'train_masks')
VAL_IMAGES_DIR = os.path.join(BASE_DIR, 'val_images')
VAL_MASKS_DIR = os.path.join(BASE_DIR, 'val_masks')

print(TRAIN_IMAGES_DIR)
print(TRAIN_MASKS_DIR)
print(VAL_IMAGES_DIR)
print(VAL_MASKS_DIR)

data/train_images
data/train_masks
data/val_images
data/val_masks


In [4]:
patients = os.listdir(SRC_DIR)
patients = [patient for patient in patients if patient.startswith('TCGA')]
print(f'Total {len(patients)} patients')

Total 110 patients


In [5]:
TEST_SIZE = 0.2
num_of_patients_for_test = int(len(patients) * TEST_SIZE)

print(f'With test size = {TEST_SIZE}, {num_of_patients_for_test} patients will be used for testing')

patients_for_test = random.sample(patients, k=num_of_patients_for_test)
assert len(patients_for_test) == num_of_patients_for_test

print(f'Patients for testing are:\n{patients_for_test}')

With test size = 0.2, 22 patients will be used for testing
Patients for testing are:
['TCGA_HT_7881_19981015', 'TCGA_FG_7643_20021104', 'TCGA_FG_A4MT_20020212', 'TCGA_HT_7692_19960724', 'TCGA_DU_7010_19860307', 'TCGA_HT_7855_19951020', 'TCGA_CS_6666_20011109', 'TCGA_DU_7294_19890104', 'TCGA_HT_A61B_19991127', 'TCGA_DU_5872_19950223', 'TCGA_FG_8189_20030516', 'TCGA_FG_6691_20020405', 'TCGA_CS_5396_20010302', 'TCGA_HT_7879_19981009', 'TCGA_HT_8563_19981209', 'TCGA_CS_5397_20010315', 'TCGA_DU_A5TP_19970614', 'TCGA_DU_5849_19950405', 'TCGA_DU_A5TU_19980312', 'TCGA_DU_6401_19831001', 'TCGA_DU_7008_19830723', 'TCGA_FG_6689_20020326']


In [6]:
for patient in patients:
    src = f'{SRC_DIR}/{patient}'
    
    if patient in patients_for_test:
        dst_images = f'{VAL_IMAGES_DIR}/{patient}'
        dst_masks = f'{VAL_MASKS_DIR}/{patient}'
    else:
        dst_images = f'{TRAIN_IMAGES_DIR}/{patient}'
        dst_masks = f'{TRAIN_MASKS_DIR}/{patient}'
        
    shutil.copytree(src, dst_images)
    shutil.copytree(src, dst_masks)

In [7]:
def remove_files(dir_, endswith=None, notendswith=None):
    for patient in os.listdir(dir_):
        images = os.listdir(f'{dir_}/{patient}')
        len_before = len(images)
        
        for image in images:
            img_path = ''
            
            if endswith and image.endswith(endswith):
                img_path = f'{dir_}/{patient}/{image}'
            elif notendswith and (not image.endswith(notendswith)):
                img_path = f'{dir_}/{patient}/{image}'

            if os.path.exists(img_path):
                  os.remove(img_path)

            
        images = os.listdir(f'{dir_}/{patient}')
        len_after = len(images)
        assert len_before == 2 * len_after

In [8]:
remove_files(TRAIN_IMAGES_DIR, endswith='_mask.tif')
remove_files(VAL_IMAGES_DIR, endswith='_mask.tif')
remove_files(TRAIN_MASKS_DIR, notendswith='_mask.tif')
remove_files(VAL_MASKS_DIR, notendswith='_mask.tif')

### View MRI images

In [9]:
def get_random_img(dir_):
    random_patient = random.sample(os.listdir(dir_), k=1)[0]
    img_filename = random.sample(os.listdir(f'{dir_}/{random_patient}'), k=1)[0]
    img_path = os.path.join(f'{dir_}/{random_patient}', img_filename)
    return img_path

In [10]:
random_image = Image.open(get_random_img(TRAIN_IMAGES_DIR))
random_mask = Image.open(get_random_img(TRAIN_MASKS_DIR))

In [11]:
random_image.size, random_image.mode, random_mask.size, random_mask.mode

((256, 256), 'RGB', (256, 256), 'L')

### Convert images to numpy arrays

In [12]:
img_arr = np.asarray(random_image)
img_arr.shape

(256, 256, 3)

In [None]:
# Convert to grayscale -> 1 channel
# Increase the size of the image (upsampling): image to 572x572, mask to 388x388
# reshape channel first

# learn Unet and Semantic Segmentation courses

In [13]:
msk_arr = np.asarray(random_mask)
msk_arr.shape

(256, 256)

### Convert mask to binary