# Data splitter
Split the data into train/validation/test folders.

In [1]:
import os, shutil
import pandas as pd

In [2]:
# Detect whether this notebook is being run from the main notebook.
# The caller is expected to have defined IS_MAIN beforehand; when the name does not
# exist, fall back to standalone mode. Only NameError is caught so that unrelated
# failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    IS_MAIN
except NameError:
    IS_MAIN = False

In [8]:
# Set up the split ratios and every path used by the later cells.
if IS_MAIN:
    print('DataSplitter mode: MAIN')
    split = {'train': 0.7, 'validation': 0.1, 'test': 0.2}  # must sum up to 1
else:
    print('DataSplitter mode: STANDALONE')
    # The test subset receives whatever is left after the train and validation
    # slices are taken, so its ratio has to be 1 - 0.6 - 0.3.
    split = {'train': 0.6, 'validation': 0.3, 'test': 0.1}  # must sum up to 1

# Fail early on inconsistent ratios; the tolerance absorbs floating-point rounding.
assert abs(sum(split.values()) - 1.0) < 1e-9, \
    'split ratios must sum up to 1, got {}'.format(split)

# Source dataset: images and their masks live in sibling folders.
original_dataset_dir = 'data/ISIC'
original_images_dir = os.path.join(original_dataset_dir, 'images')
original_masks_dir = os.path.join(original_dataset_dir, 'masks')

# Destination root that holds the train/validation/test folders.
base_dir = 'data'

train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

train_images_dir = os.path.join(train_dir, 'images')
train_masks_dir = os.path.join(train_dir, 'masks')

validation_images_dir = os.path.join(validation_dir, 'images')
validation_masks_dir = os.path.join(validation_dir, 'masks')

test_images_dir = os.path.join(test_dir, 'images')
test_masks_dir = os.path.join(test_dir, 'masks')

# Base names (without the '.csv' extension) of the info files in the dataset folder.
images_info_fn = 'images_info'
masks_info_fn = 'masks_info'

DataSplitter mode: STANDALONE


In [5]:
# create folders
def mkdir(dir_path):
    """Create `dir_path` (and any missing parent folders) if it does not exist yet.

    Calling it on an already existing directory is a no-op. `exist_ok=True`
    replaces the separate existence check, so there is no window between the
    check and the creation in which another process could create the folder.

    Raises:
        FileExistsError: if `dir_path` exists but is not a directory.
    """
    os.makedirs(dir_path, exist_ok=True)

# Build the folder tree: the base folder first, then each subset folder,
# then the images/masks subfolders of every subset.
for folder in (
    base_dir,
    train_dir,
    validation_dir,
    test_dir,
    train_images_dir,
    train_masks_dir,
    validation_images_dir,
    validation_masks_dir,
    test_images_dir,
    test_masks_dir,
):
    mkdir(folder)

In [9]:
# Read the image names from the masks info file and partition them by `split`

masksinfo_file_path = os.path.join(original_dataset_dir, masks_info_fn + '.csv')
masks_info = pd.read_csv(masksinfo_file_path)

images_names = list(masks_info['name'])

# Subset sizes are truncated to whole images. The names are taken in file order:
# train first, then validation; the test subset is everything that is left, so
# test_split itself is not used for slicing.
train_split, validation_split, test_split = (
    int(len(images_names) * split[subset]) for subset in ('train', 'validation', 'test')
)
validation_end = train_split + validation_split

train_images_filenames = images_names[:train_split]
validation_images_filenames = images_names[train_split:validation_end]
test_images_filenames = images_names[validation_end:]

# Summary of the partition; the last line should match the total
subset_sizes = [
    len(train_images_filenames),
    len(validation_images_filenames),
    len(test_images_filenames),
]
for template, count in (
    ('Total images:     {}', len(images_names)),
    ('Train split:      {}', subset_sizes[0]),
    ('Validation split: {}', subset_sizes[1]),
    ('Test split:       {}', subset_sizes[2]),
):
    print(template.format(count))
print('-'*23)
print('Split sum:        {}'.format(sum(subset_sizes)))

Total images:     700
Train split:      420
Validation split: 210
Test split:       70
-----------------------
Split sum:        700


In [10]:
def _copy_images_and_masks(image_names, src_images_dir, src_masks_dir,
                           dst_images_dir, dst_masks_dir):
    """Copy each named image and its mask into the given destination folders.

    For every name in `image_names` the file `<name>.png` is copied from
    `src_images_dir` to `dst_images_dir`, and `<name>_mask.png` from
    `src_masks_dir` to `dst_masks_dir`. File names are kept unchanged.
    """
    for image_name in image_names:
        image_fn = image_name + '.png'
        mask_fn = image_name + '_mask.png'

        shutil.copyfile(os.path.join(src_images_dir, image_fn),
                        os.path.join(dst_images_dir, image_fn))
        shutil.copyfile(os.path.join(src_masks_dir, mask_fn),
                        os.path.join(dst_masks_dir, mask_fn))


print('Copy images and masks into the folders...')
# Same procedure for every subset, in the order train, validation, test
for subset_names, subset_images_dir, subset_masks_dir in (
    (train_images_filenames, train_images_dir, train_masks_dir),
    (validation_images_filenames, validation_images_dir, validation_masks_dir),
    (test_images_filenames, test_images_dir, test_masks_dir),
):
    _copy_images_and_masks(subset_names, original_images_dir, original_masks_dir,
                           subset_images_dir, subset_masks_dir)

print('done.')

Copy images and masks into the folders...
done.


In [18]:
# Report how many entries each destination folder contains
for report_line, folder in (
    ('Total training images:   {:4} in `{}`', train_images_dir),
    ('Total training masks:    {:4} in `{}`', train_masks_dir),
    ('Total validation images: {:4} in `{}`', validation_images_dir),
    ('Total validation masks:  {:4} in `{}`', validation_masks_dir),
    ('Total test images:       {:4} in `{}`', test_images_dir),
    ('Total test masks:        {:4} in `{}`', test_masks_dir),
):
    print(report_line.format(len(os.listdir(folder)), folder))

Total training images:    420 in `data/train/images`
Total training masks:     420 in `data/train/masks`
Total validation images:  210 in `data/validation/images`
Total validation masks:   210 in `data/validation/masks`
Total test images:         70 in `data/test/images`
Total test masks:          70 in `data/test/masks`
