# Library import

In [1]:
import random
import os
import shutil

# Function for data split

In [2]:
def data_split(path_from_imgs, path_from_labels, path_to_imgs, path_to_labels, thresh):
    '''
    Move a proportional random subset of a dataset into separate folders.

    The subset keeps the same ratio of labeled to unlabeled images as the
    full set: a fraction `thresh` of the labeled images (together with their
    label files) and the same fraction of the unlabeled images are selected
    and moved out of the source folders.

    An image and a label belong together when their file names are equal once
    the extension is removed.

    Receives:
    path_from_imgs: path where the images are stored
    path_from_labels: path where the labels are stored
    path_to_imgs: path where the subset images will be saved
    path_to_labels: path where the subset labels will be saved
    thresh: proportion of the dataset that will be moved to the subset (0 to 1)

    Returns nothing

    Raises:
    ValueError: if thresh is outside the range 0 to 1
    FileNotFoundError: if a selected label has no matching image
    '''

    if not 0 <= thresh <= 1:
        raise ValueError('thresh must be between 0 and 1, got %r' % (thresh,))

    # Private generator with a fixed seed: the split is reproducible and the
    # global `random` state used by the rest of the notebook is left alone.
    rng = random.Random(0)

    # os.listdir returns entries in arbitrary order; sorting is required for
    # the fixed seed to select the same files on every machine.
    label_list = sorted(os.listdir(path_from_labels))
    image_list = sorted(os.listdir(path_from_imgs))

    # splitext (rather than split('.')[0]) keeps dots that are part of the
    # name, and the mapping remembers each image's real extension.
    image_by_stem = {os.path.splitext(img)[0]: img for img in image_list}
    label_stems = {os.path.splitext(label)[0] for label in label_list}

    # Selection of the labeled (positive) examples for the subset
    positive_subset_labels = rng.sample(label_list, round(len(label_list) * thresh))
    positive_stems = [os.path.splitext(label)[0] for label in positive_subset_labels]

    # Fail before anything is moved, so an inconsistent dataset is never left
    # half split.
    orphan_labels = [label for label, stem in zip(positive_subset_labels, positive_stems)
                     if stem not in image_by_stem]
    if orphan_labels:
        raise FileNotFoundError('no image found for label(s): ' + ', '.join(orphan_labels))

    positive_subset_imgs = [image_by_stem[stem] for stem in positive_stems]

    # Selection of the unlabeled (negative) examples: the same fraction of the
    # negatives as was taken from the positives keeps the class ratio intact.
    negative_cases = [img for stem, img in image_by_stem.items() if stem not in label_stems]
    negative_subset = rng.sample(negative_cases, round(len(negative_cases) * thresh))

    subset_imgs = positive_subset_imgs + negative_subset

    # Create the destination folders if needed
    os.makedirs(path_to_labels, exist_ok=True)
    os.makedirs(path_to_imgs, exist_ok=True)

    # Send files to the subset folders
    for label in positive_subset_labels:
        shutil.move(os.path.join(path_from_labels, label), path_to_labels)

    for img in subset_imgs:
        shutil.move(os.path.join(path_from_imgs, img), path_to_imgs)

    return

# Test dataset

Made using 10% of the total dataset

In [3]:
# Source folders (full dataset) and destination folders (test subset).
path_from_imgs = '/home/ld_brito/DeepL/dataset description/dataset/images'
path_from_labels = '/home/ld_brito/DeepL/dataset description/dataset/labels'
path_to_imgs = '/home/ld_brito/DeepL/dataset description/test/images'
path_to_labels = '/home/ld_brito/DeepL/dataset description/test/labels'

# NOTE: data_split moves files out of the source folders, so running this
# cell a second time would move a further 10% of what is left.
data_split(
    path_from_imgs=path_from_imgs,
    path_from_labels=path_from_labels,
    path_to_imgs=path_to_imgs,
    path_to_labels=path_to_labels,
    thresh=0.1,
)

# Validation dataset

Made using 30% of the remaining dataset

In [4]:
# Source folders (what is still in the dataset folder) and destination
# folders (validation subset).
path_from_imgs = '/home/ld_brito/DeepL/dataset description/dataset/images'
path_from_labels = '/home/ld_brito/DeepL/dataset description/dataset/labels'
path_to_imgs = '/home/ld_brito/DeepL/dataset description/valid/images'
path_to_labels = '/home/ld_brito/DeepL/dataset description/valid/labels'

# NOTE: data_split moves files, so the 30% is taken from whatever the source
# folders contain when this cell runs; run it once, after the test split.
data_split(
    path_from_imgs=path_from_imgs,
    path_from_labels=path_from_labels,
    path_to_imgs=path_to_imgs,
    path_to_labels=path_to_labels,
    thresh=0.3,
)

# Train dataset

Once the test and validation subsets have been moved out, the data remaining in the dataset folder is renamed to become the train dataset.

In [5]:
# Whatever is still inside the dataset folder becomes the training set:
# the folder itself is renamed, no files are copied.
remaining_dataset_dir = '/home/ld_brito/DeepL/dataset description/dataset'
train_dataset_dir = '/home/ld_brito/DeepL/dataset description/train'

os.rename(remaining_dataset_dir, train_dataset_dir)