# Data splitter
Split the data into train/validation/test folders.

In [36]:
import os, shutil
import pandas as pd

In [37]:
# Check whether this notebook was called from the main one: the caller is
# expected to have defined IS_MAIN already. Only a missing name means
# "standalone" - any other error should surface instead of being swallowed.
try:
    IS_MAIN
except NameError:
    IS_MAIN = False

In [38]:
# Setup necessary parameters.
# NOTE(review): transform_images is not read in the visible part of this
# notebook - presumably consumed by a later notebook; confirm.
transform_images = True

# Fractions of the data assigned to each subset; they must sum up to 1.
if IS_MAIN:
    print('DataSplitter mode: MAIN')
    split = {'train':0.7, 'validation':0.1, 'test':0.2}
else:
    print('DataSplitter mode: STANDALONE ')
    # The test share was 0.2, which made the fractions sum to 1.1. The test
    # subset receives whatever is left after train and validation, so 0.1 is
    # the share it actually gets.
    split = {'train':0.6, 'validation':0.3, 'test':0.1}

# Guard against a mistyped configuration (tolerance absorbs float rounding).
assert abs(sum(split.values()) - 1.0) < 1e-9, \
    'split fractions must sum up to 1, got {}'.format(sum(split.values()))



DataSplitter mode: STANDALONE 


### Define folder names and create them if necessary

In [40]:
from os.path import join

# Source dataset: original images and masks
original_dataset_dir = 'data/ISIC'
original_images_dir, original_masks_dir = (
    join(original_dataset_dir, kind) for kind in ('images', 'masks')
)

# Root of the generated tree, with one sub-tree per task
base_dir = 'data/raw'
seg_dir, clf_dir = (
    join(base_dir, task) for task in ('segmentation', 'classification')
)

# Segmentation: <subset>/images and <subset>/masks
seg_train_dir, seg_validation_dir, seg_test_dir = (
    join(seg_dir, subset) for subset in ('train', 'validation', 'test')
)
seg_train_img_dir, seg_train_msk_dir = (
    join(seg_train_dir, kind) for kind in ('images', 'masks')
)
seg_validation_img_dir, seg_validation_msk_dir = (
    join(seg_validation_dir, kind) for kind in ('images', 'masks')
)
seg_test_img_dir, seg_test_msk_dir = (
    join(seg_test_dir, kind) for kind in ('images', 'masks')
)

# Classification: <subset>/nevus and <subset>/melanoma
clf_train_dir, clf_validation_dir, clf_test_dir = (
    join(clf_dir, subset) for subset in ('train', 'validation', 'test')
)
clf_train_nev_dir, clf_train_mel_dir = (
    join(clf_train_dir, label) for label in ('nevus', 'melanoma')
)
clf_validation_nev_dir, clf_validation_mel_dir = (
    join(clf_validation_dir, label) for label in ('nevus', 'melanoma')
)
clf_test_nev_dir, clf_test_mel_dir = (
    join(clf_test_dir, label) for label in ('nevus', 'melanoma')
)

# Names of the CSV files describing images and masks
images_info_fn = 'images_info.csv'
masks_info_fn = 'masks_info.csv'

In [41]:
# Remove the output tree left by a previous run so subsets never mix old and
# new files.
print(f'Deleting old {base_dir}')
try:
    shutil.rmtree(base_dir)
except FileNotFoundError:
    # Nothing to delete (e.g. first run). Any other failure, such as a
    # permission problem, is deliberately NOT swallowed: continuing on a
    # partially deleted tree would silently corrupt the split.
    print('Folders do not exist')

# Create folders
print(f'Creating new empty {base_dir} tree')
def mkdir(dir_path):
    """Create `dir_path` together with any missing parent directories.

    Calling it for a directory that already exists is a no-op.
    `exist_ok=True` replaces the former exists()-then-makedirs() sequence,
    which could fail if the directory appeared between the two calls.

    Raises:
        FileExistsError: if `dir_path` exists but is not a directory.
    """
    os.makedirs(dir_path, exist_ok=True)

# Create every leaf folder of the output tree: segmentation folders first,
# then classification folders.
for leaf_dir in (
    seg_validation_img_dir, seg_validation_msk_dir,
    seg_train_img_dir, seg_train_msk_dir,
    seg_test_img_dir, seg_test_msk_dir,
    clf_validation_nev_dir, clf_validation_mel_dir,
    clf_train_nev_dir, clf_train_mel_dir,
    clf_test_nev_dir, clf_test_mel_dir,
):
    mkdir(leaf_dir)

Deleting old data/raw
Folders do not exist
Creating new empty data/raw tree


In [43]:
# Load the CSV descriptions of masks and images
masksinfo_fn = os.path.join(original_dataset_dir, masks_info_fn)
imagesinfo_fn = os.path.join(original_dataset_dir, images_info_fn)
masks_info, images_info = pd.read_csv(masksinfo_fn), pd.read_csv(imagesinfo_fn)

# Image names are taken from the masks table
images_names = list(masks_info['name'])
if not IS_MAIN:
    # standalone runs work on a smaller number of images for testing
    images_names = images_names[:500]

# Subset sizes; int() truncates towards zero
train_split, validation_split, test_split = (
    int(len(images_names) * split[subset])
    for subset in ('train', 'validation', 'test')
)

# Consecutive slices: train first, then validation, and the test subset
# receives everything that is left (test_split is not used for slicing).
validation_end = train_split + validation_split
train_images_filenames = images_names[:train_split]
validation_images_filenames = images_names[train_split:validation_end]
test_images_filenames = images_names[validation_end:]

# Report the subset sizes and check that they add up to the total
subset_sizes = [
    len(train_images_filenames),
    len(validation_images_filenames),
    len(test_images_filenames),
]
print('Total images:     {}'.format(len(images_names)))
for template, size in zip(
    ('Train split:      {}', 'Validation split: {}', 'Test split:       {}'),
    subset_sizes,
):
    print(template.format(size))
print('-'*23)
print('Split sum:        {}'.format(sum(subset_sizes)))


Total images:     500
Train split:      300
Validation split: 150
Test split:       50
-----------------------
Split sum:        500


In [55]:
print('Copy images and masks for segmentation and classifiaction...')
def copy_files(filenames, origin_img_dir, origin_msk_dir, 
               clf_nev_dir, clf_mel_dir, seg_img_dir, seg_msk_dir, images_info=images_info):
    """Copy one subset of images and masks into the output folders.

    For every name in `filenames`:
      * `<name>.png` from `origin_img_dir` is copied to `seg_img_dir` and to
        the classification folder matching its diagnosis: `clf_nev_dir` when
        the diagnosis is 'nevus', `clf_mel_dir` for any other diagnosis;
      * `<name>_mask.png` from `origin_msk_dir` is copied to `seg_msk_dir`.

    Args:
        filenames: iterable of image names (without extension).
        origin_img_dir: folder holding the source images.
        origin_msk_dir: folder holding the source masks.
        clf_nev_dir: classification destination for 'nevus' images.
        clf_mel_dir: classification destination for all other images.
        seg_img_dir: segmentation destination for images.
        seg_msk_dir: segmentation destination for masks.
        images_info: table with 'isic_name' and 'diagnosis' columns. The
            default is bound once, when this function is defined.

    Raises:
        ValueError: if an image name does not match exactly one row of
            `images_info`.
    """
    for image_name in filenames:
        # Look up the diagnosis of THIS image. The lookup used to compare
        # 'isic_name' with the literal 'nevus' instead of the image name.
        matches = images_info.loc[images_info['isic_name'] == image_name, 'diagnosis']
        if len(matches) != 1:
            raise ValueError(
                'Expected exactly one images_info row for {!r}, found {}'.format(
                    image_name, len(matches)))
        diagnosis = matches.item()

        # Anything that is not a nevus goes to the melanoma folder
        if diagnosis == 'nevus':
            dst_clf_img = join(clf_nev_dir, image_name+'.png') 
        else:
            dst_clf_img = join(clf_mel_dir, image_name+'.png')

        src_img = join(origin_img_dir,  image_name+'.png')
        dst_seg_img = join(seg_img_dir, image_name+'.png')

        src_mask = join(origin_msk_dir,  image_name+'_mask.png')
        dst_seg_mask = join(seg_msk_dir, image_name+'_mask.png')

        shutil.copyfile(src_img, dst_clf_img) 
        shutil.copyfile(src_img, dst_seg_img) 
        shutil.copyfile(src_mask, dst_seg_mask) 

    
# One row per subset: progress message, file names, classification folders
# (nevus, melanoma) and segmentation folders (images, masks).
for message, subset_filenames, nev_dir, mel_dir, img_dir, msk_dir in (
    ('Copy train images and masks...', train_images_filenames,
     clf_train_nev_dir, clf_train_mel_dir,
     seg_train_img_dir, seg_train_msk_dir),
    ('Copy validation images and masks...', validation_images_filenames,
     clf_validation_nev_dir, clf_validation_mel_dir,
     seg_validation_img_dir, seg_validation_msk_dir),
    ('Copy test images and masks...', test_images_filenames,
     clf_test_nev_dir, clf_test_mel_dir,
     seg_test_img_dir, seg_test_msk_dir),
):
    print(message)
    copy_files(subset_filenames,
               original_images_dir, original_masks_dir,
               nev_dir, mel_dir,
               img_dir, msk_dir)

Copy images and masks for segmentation and classifiaction...
Copy train images and masks...
Copy validation images and masks...
Copy test images and masks...


In [62]:
# Report how many files ended up in every output folder.
# Each row pairs a line template with the folder whose entries are counted.
segmentation_rows = (
    ('Segmentation train images:     {:4} in `{}`', seg_train_img_dir),
    ('Segmentation train masks:      {:4} in `{}`', seg_train_msk_dir),
    ('Segmentation validation images:{:4} in `{}`', seg_validation_img_dir),
    ('Segmentation validation masks: {:4} in `{}`', seg_validation_msk_dir),
    ('Segmentation test images:      {:4} in `{}`', seg_test_img_dir),
    ('Segmentation test masks:       {:4} in `{}`', seg_test_msk_dir),
)
classification_rows = (
    ('Classification train nevus:        {:4} in `{}`', clf_train_nev_dir),
    ('Classification train melanoma:     {:4} in `{}`', clf_train_mel_dir),
    ('Classification validation nevus:   {:4} in `{}`', clf_validation_nev_dir),
    ('Classification validation melanoma:{:4} in `{}`', clf_validation_mel_dir),
    ('Classification test nevus:         {:4} in `{}`', clf_test_nev_dir),
    ('Classification test melanoma:      {:4} in `{}`', clf_test_mel_dir),
)

for template, folder in segmentation_rows:
    print(template.format(len(os.listdir(folder)), folder))
print('-'*80)
for template, folder in classification_rows:
    print(template.format(len(os.listdir(folder)), folder))

Segmentation train images:      300 in `data/raw/segmentation/train/images`
Segmentation train masks:       300 in `data/raw/segmentation/train/masks`
Segmentation validation images: 150 in `data/raw/segmentation/validation/images`
Segmentation validation masks:  150 in `data/raw/segmentation/validation/masks`
Segmentation test images:        50 in `data/raw/segmentation/test/images`
Segmentation test masks:         50 in `data/raw/segmentation/test/masks`
--------------------------------------------------------------------------------
Classification train nevus:         216 in `data/raw/classification/train/nevus`
Classification train melanoma:       84 in `data/raw/classification/train/melanoma`
Classification validation nevus:    105 in `data/raw/classification/validation/nevus`
Classification validation melanoma:  45 in `data/raw/classification/validation/melanoma`
Classification test nevus:           41 in `data/raw/classification/test/nevus`
Classification test melanoma:         

In [None]:
# run preprocessing on train images
# Hands over to the `2.Preprocessing.ipynb` notebook through the IPython %run magic.
# NOTE(review): this line is unconditional, so it executes in both MAIN and
# STANDALONE modes - confirm that is intended.
%run '2.Preprocessing.ipynb'