# Data File Generation

This notebook will be used to create text files of the paths to images/masks for training/testing.

In [3]:
import os
import glob
import re
import pandas as pd

In [5]:
files = pd.read_csv('/data/shared/prostate/yale_prostate/input_lists/MR_yale.csv')

In [9]:
files['IMAGE'][0:245]

0      /data/shared/prostate/yale_prostate/MR_resampl...
1      /data/shared/prostate/yale_prostate/MR_resampl...
2      /data/shared/prostate/yale_prostate/MR_resampl...
3      /data/shared/prostate/yale_prostate/MR_resampl...
4      /data/shared/prostate/yale_prostate/MR_resampl...
                             ...                        
240    /data/shared/prostate/yale_prostate/MR_resampl...
241    /data/shared/prostate/yale_prostate/MR_resampl...
242    /data/shared/prostate/yale_prostate/MR_resampl...
243    /data/shared/prostate/yale_prostate/MR_resampl...
244    /data/shared/prostate/yale_prostate/MR_resampl...
Name: IMAGE, Length: 245, dtype: object

In [6]:
files

Unnamed: 0,IMAGE,SEGM
0,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
1,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
2,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
3,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
4,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
...,...,...
343,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
344,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
345,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...
346,/data/shared/prostate/yale_prostate/MR_resampl...,/data/shared/prostate/yale_prostate/MR_mask_re...


In [2]:
data_dir = "data/"
animals = set()
print("-----Available directories:-----")
for directory in sorted(os.listdir(data_dir)):
    animal = re.search(r'\d+', directory)
    if animal != None:
        animals.add(animal.group(0))
    print(directory)
    
animals = sorted([int(animal) for animal in animals])
print("\n-----Available animals:-----")
for animal in animals:
    print(animal, end=', ')
    
images = sorted(glob.glob(data_dir + "**/*CTImg*", recursive=True))
masks = sorted(glob.glob(data_dir + "**/*Mask*", recursive=True))

-----Available directories:-----
.ipynb_checkpoints
PSEA12 Baseline
PSEA12 PostGel
PSEA12 PostMI
PSEA13 Baseline
PSEA13 PostGel
PSEA13 PostMI
PSEA14 Baseline
PSEA14 PostGel
PSEA18 Baseline
PSEA18 PostGel
PSEA18 PostMI
PSEA20 PostGel
PSEA20 PostMI
PSEA25 Baseline
PSEA25 PostGel
PSEA25 PostMI
PSEA27 Baseline
PSEA27 PostGel
PSEA27 PostMI
test_imgs.txt
test_masks.txt
train_imgs.txt
train_masks.txt

-----Available animals:-----
12, 13, 14, 18, 20, 25, 27, 

In [3]:
def check_data_alignment(images, masks):
    """Checks whether the images/masks are aligned properly."""
    for (image, mask) in zip(images, masks):
        image_sub = re.sub(r'CTImg', '', image)
        mask_sub = re.sub(r'Mask', '', mask)
        if image_sub != mask_sub:
            print(image)
            print(mask)
            return False
    return True

print('Is the data aligned: ', check_data_alignment(images, masks))

Is the data aligned:  True


In [4]:
# Change these lists to change train/test distribution
train_animals = [13, 14, 18, 20, 25, 27]
test_animals = [12]

In [5]:
def paths_to_file(images, masks, animals, group):
    """Returns the images and masks as a dictionary for specific animals."""
    filtered_images = []
    filtered_masks = []
    for animal in animals:
        filtered_images.extend(filter(lambda x: "PSEA" + str(animal) in x, images))
        filtered_masks.extend(filter(lambda x: "PSEA" + str(animal) in x, masks))
        
    with open(data_dir + group + '_imgs.txt', 'w') as f:
        f.writelines("%s\n" % image for image in filtered_images)
    with open(data_dir + group + '_masks.txt', 'w') as f:
        f.writelines("%s\n" % mask for mask in filtered_masks)

In [6]:
paths_to_file(images, masks, train_animals, 'train')
paths_to_file(images, masks, test_animals, 'test')