# Data File Generation

This notebook will be used to create text files of the paths to images/masks for training/testing.

In [10]:
import os
import glob
import re
import random

In [11]:
data_dir = "data/new/"
animals = set()
print("-----Available directories:-----")
for directory in sorted(os.listdir(data_dir)):
    animal = re.search(r'\w+ *\d+', directory)
    if animal != None:
        animals.add(animal.group(0))
    print(directory)
    
animals = list(animals)
print("\n-----Available animals:-----")
for animal in animals:
    print(animal, end=', ')
    
images = sorted(glob.glob(data_dir + "**/*CTImg*", recursive=True))
images = list(map(lambda x: x.replace(data_dir,''), images))
masks = sorted(glob.glob(data_dir + "**/*Mask*", recursive=True))
masks = list(map(lambda x: x.replace(data_dir,''), masks))

-----Available directories:-----
.ipynb_checkpoints
PPC 02 Baseline
PPC 02 PostGel
PPC 02 PostGel7D
PPC 02 PostMI
PPC 02 PostMI5D
PPC 03 Baseline
PPC 03 PostGel
PPC 03 PostGel7D
PPC 03 PostMI
PPC 03 PostMI5D
PPC 04 Baseline
PPC 04 PostGel
PPC 04 PostGel7D
PPC 04 PostMI
PPC 04 PostMI5D
PPC 05 Baseline
PPC 05 PostGel
PPC 05 PostGel7D
PPC 05 PostMI
PPC 05 PostMI5D
PPC 06 Baseline
PPC 06 PostGel
PPC 06 PostGel7D
PPC 06 PostMI
PPC 06 PostMI5D
PPC 07 Baseline
PPC 07 PostGel
PPC 07 PostGel7D
PPC 07 PostMI
PPC 07 PostMI5D
PPC 08 Baseline
PPC 08 PostGel
PPC 08 PostGel7D
PPC 08 PostMI
PPC 08 PostMI5D
PPC 09 Baseline
PPC 09 PostGel
PPC 09 PostGel7D
PPC 09 PostMI
PPC 09 PostMI5D
PPC 11 Baseline
PPC 11 PostGel
PPC 11 PostGel7D
PPC 11 PostMI
PPC 11 PostMI5D
PPC 12 Baseline
PPC 12 PostGel
PPC 12 PostGel7D
PPC 12 PostMI
PPC 12 PostMI5D
PPC 13 Baseline
PPC 13 PostGel
PPC 13 PostGel7D
PPC 13 PostMI
PPC 13 PostMI5D
PPC 14 Baseline
PPC 14 PostGel
PPC 14 PostGel7D
PPC 14 PostMI
PPC 14 PostMI5D
PSEA12 PostG

In [12]:
images

['PPC 02 Baseline/PPC02_Baseline_CTImg_TimeED.nii',
 'PPC 02 Baseline/PPC02_Baseline_CTImg_TimeES.nii',
 'PPC 02 PostGel/PPC02_PostGel_CTImg_TimeED.nii',
 'PPC 02 PostGel/PPC02_PostGel_CTImg_TimeES.nii',
 'PPC 02 PostGel7D/PPC02_PostGel7D_CTImg_TimeED.nii',
 'PPC 02 PostGel7D/PPC02_PostGel7D_CTImg_TimeES.nii',
 'PPC 02 PostMI/PPC02_PostMI_CTImg_TimeED.nii',
 'PPC 02 PostMI/PPC02_PostMI_CTImg_TimeES.nii',
 'PPC 02 PostMI5D/PPC02_PostMI5D_CTImg_TimeED.nii',
 'PPC 02 PostMI5D/PPC02_PostMI5D_CTImg_TimeES.nii',
 'PPC 03 Baseline/PPC03_Baseline_CTImg_TimeED.nii',
 'PPC 03 Baseline/PPC03_Baseline_CTImg_TimeES.nii',
 'PPC 03 PostGel/PPC03_PostGel_CTImg_TimeED.nii',
 'PPC 03 PostGel/PPC03_PostGel_CTImg_TimeES.nii',
 'PPC 03 PostGel7D/PPC03_PostGel7D_CTImg_TimeED.nii',
 'PPC 03 PostGel7D/PPC03_PostGel7D_CTImg_TimeES.nii',
 'PPC 03 PostMI/PPC03_PostMI_CTImg_TimeED.nii',
 'PPC 03 PostMI/PPC03_PostMI_CTImg_TimeES.nii',
 'PPC 03 PostMI5D/PPC03_PostMI5D_CTImg_TimeED.nii',
 'PPC 03 PostMI5D/PPC03_Post

In [13]:
def check_data_alignment(images, masks):
    """Checks whether the images/masks are aligned properly."""
    for (image, mask) in zip(images, masks):
        image_sub = re.sub(r'CTImg', '', image)
        mask_sub = re.sub(r'Mask', '', mask)
        if image_sub != mask_sub:
            print(image)
            print(mask)
            return False
    return True

print('Is the data aligned: ', check_data_alignment(images, masks))

Is the data aligned:  True


In [14]:
animals

['PSEA27',
 'PPC 13',
 'PPC 12',
 'PSEA20',
 'PPC 03',
 'PSEA14',
 'PPC 07',
 'PPC 11',
 'PPC 08',
 'PSEA18',
 'PPC 04',
 'PPC 02',
 'PSEA13',
 'PPC 06',
 'PSEA12',
 'PSEA25',
 'PPC 05',
 'PPC 09',
 'PPC 14']

In [15]:
# Change these lists to change train/test distribution
train_animals = random.sample(animals, 15)
test_animals =  [animal for animal in animals if animal not in train_animals]
print(train_animals)
print(test_animals)

['PSEA18', 'PPC 05', 'PPC 04', 'PPC 07', 'PSEA12', 'PPC 09', 'PPC 12', 'PPC 11', 'PPC 08', 'PSEA13', 'PPC 06', 'PSEA14', 'PSEA20', 'PPC 02', 'PPC 14']
['PSEA27', 'PPC 13', 'PPC 03', 'PSEA25']


In [16]:
len(set(train_animals + test_animals))

19

In [17]:
len(animals)

19

In [18]:
def paths_to_file(images, masks, animals, group):
    """Returns the images and masks as a dictionary for specific animals."""
    filtered_images = []
    filtered_masks = []
    for animal in animals:
        filtered_images.extend(filter(lambda x: animal in x, images))
        filtered_masks.extend(filter(lambda x: animal in x, masks))
        
    with open(data_dir + group + '_imgs.txt', 'w') as f:
        f.writelines("%s\n" % image for image in filtered_images)
    with open(data_dir + group + '_masks.txt', 'w') as f:
        f.writelines("%s\n" % mask for mask in filtered_masks)

In [19]:
paths_to_file(images, masks, train_animals, 'train')
paths_to_file(images, masks, test_animals, 'test')