## Script para dividir el conjunto de datos en: train, valid y test

In [4]:
#Importar la libreria para el manejo de archivos
import os
import random
from shutil import copyfile
import shutil

In [5]:
def img_train_test_split(img_source_dir, train_size, validation_size):
    """
    Parámetros
    ----------
    img_source_dir : string
    Directorio de las imágenes     
        
    train_size : float
    Porcentaje de la muestra de entrenamiento: 0.80 (80%)
    
    validation_size : float
    Porcentaje de la muestra de validación: 0.15 (15%)
    
    El restante 5% proporciona el numero de imágenes de prueba, imagenes que nunca ha visto el modelo 
    """
        
    # Configurar la estructura de carpetas vacías si no existe
    
    if not os.path.exists('dataset'):
        os.makedirs('dataset')        
    else :
        shutil.rmtree('dataset')
    
    subdir_fullpath = img_source_dir
    
    if len(os.listdir(subdir_fullpath)) == 0:
        print(subdir_fullpath + ' is empty')

    train_subdir = 'dataset/train'
    validation_subdir = 'dataset/valid'
    test_subdir = 'dataset/test'

        
    # Cree subdirectorios en carpetas de entrenamiento, validación y test
    if not os.path.exists(train_subdir):
        os.makedirs(train_subdir)

    if not os.path.exists(validation_subdir):
        os.makedirs(validation_subdir)
            
    if not os.path.exists(test_subdir):
        os.makedirs(test_subdir)

    train_counter = 0
    validation_counter = 0
    test_counter = 0
    
    # Contar el número de imagenes totales
    count_images=0
    for filename in os.listdir(subdir_fullpath):
        if filename.endswith(".jpg"):  
            count_images+= 1
    print(count_images)
    
    total_images=count_images
    count_images=0
    
    #Ordenar de manera aleatoria las imagenes
    list_files=os.listdir(subdir_fullpath)
    random.shuffle(list_files)
    
    #Separacion de imagenes
    for filename in list_files:
        if filename.endswith(".jpg"): 
            fileparts = filename.split('.')
            if count_images <= int(total_images*train_size):
                copyfile(os.path.join(subdir_fullpath, filename), os.path.join(train_subdir, filename))
                copyfile(os.path.join(subdir_fullpath, fileparts[0] + '.txt'), os.path.join(train_subdir, fileparts[0] + '.txt') )
                copyfile(os.path.join(subdir_fullpath, fileparts[0] + '.xml'), os.path.join(train_subdir, fileparts[0] + '.xml') )
                train_counter += 1
            elif  count_images > int(total_images*train_size) and count_images <= int(total_images*(train_size + validation_size ))  :
                copyfile(os.path.join(subdir_fullpath, filename), os.path.join(validation_subdir, filename))
                copyfile(os.path.join(subdir_fullpath, fileparts[0] + '.txt'), os.path.join(validation_subdir, fileparts[0] + '.txt') )
                copyfile(os.path.join(subdir_fullpath, fileparts[0] + '.xml'), os.path.join(validation_subdir, fileparts[0] + '.xml') )
                validation_counter += 1
            elif count_images > int(total_images*(train_size + validation_size )) :
                copyfile(os.path.join(subdir_fullpath, filename), os.path.join(test_subdir,filename))
                copyfile(os.path.join(subdir_fullpath, fileparts[0] + '.txt'), os.path.join(test_subdir, fileparts[0] + '.txt') )
                copyfile(os.path.join(subdir_fullpath, fileparts[0] + '.xml'), os.path.join(test_subdir, fileparts[0] + '.xml') )
                test_counter += 1
            count_images += 1
                    
    print('Copied ' + str(train_counter) + ' images to dataset/train/'  )
    print('Copied ' + str(validation_counter) + ' images to dataset/validation/'  )
    print('Copied ' + str(test_counter) + ' images to dataset/test/'  )
    
    #Copiar el archivo classes.txt a los diferentes directorios pero con otro nombre    
    copyfile(os.path.join(subdir_fullpath, 'classes.txt'), os.path.join(train_subdir, "_darknet.labels"))
    copyfile(os.path.join(subdir_fullpath, 'classes.txt'), os.path.join(validation_subdir, "_darknet.labels"))
    copyfile(os.path.join(subdir_fullpath, 'classes.txt'), os.path.join(test_subdir, "_darknet.labels"))

In [6]:

 img_train_test_split("data", 0.80,0.15)

800
Copied 641 images to dataset/train/
Copied 120 images to dataset/validation/
Copied 39 images to dataset/test/
