In [5]:
import os
import shutil
import random

In [6]:
def split_dataset(dataset_path, train_path, val_path, test_path, val_ratio, test_ratio, seed=None):
    """Copy a class-per-folder dataset into train / validation / test folders.

    Every immediate subdirectory of ``dataset_path`` is treated as one class.
    For each class, its files are shuffled and then copied into a subdirectory
    of the same name under ``val_path``, ``test_path`` and ``train_path``.
    Source files are copied, never moved or deleted.

    Args:
        dataset_path: Directory containing one subdirectory per class.
        train_path: Destination root for the training split.
        val_path: Destination root for the validation split.
        test_path: Destination root for the testing split.
        val_ratio: Fraction of each class sent to validation. The per-class
            count is ``int(n * val_ratio)``, i.e. truncated toward zero.
        test_ratio: Fraction of each class sent to testing (same truncation).
        seed: Optional seed for a private random generator. When ``None``
            (the default) the global ``random`` module state is used, so a
            caller may still control the shuffle with ``random.seed``.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.

    Note:
        Files already present in the destination folders are not removed.
        Re-running with a different shuffle can therefore leave the same file
        in more than one split; pass ``seed`` or start from empty folders.
    """
    # Small tolerance so ratio pairs that should sum to exactly 1 are not
    # rejected because of floating-point rounding.
    if val_ratio < 0 or test_ratio < 0 or val_ratio + test_ratio > 1 + 1e-9:
        raise ValueError(
            'val_ratio and test_ratio must be non-negative and sum to at most 1 '
            '(got val_ratio=%r, test_ratio=%r)' % (val_ratio, test_ratio)
        )

    # A private generator keeps a seeded split independent of global state.
    rng = random.Random(seed) if seed is not None else random

    # os.listdir order is arbitrary; sorting makes the RNG consumption order
    # (and therefore the seeded result) deterministic.
    for class_name in sorted(os.listdir(dataset_path)):
        class_path = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_path):
            continue

        train_class_path = os.path.join(train_path, class_name)
        val_class_path = os.path.join(val_path, class_name)
        test_class_path = os.path.join(test_path, class_name)
        for split_dir in (train_class_path, val_class_path, test_class_path):
            os.makedirs(split_dir, exist_ok=True)

        # Only regular files can be copied with shutil.copy; a nested
        # directory would raise and abort the whole split.
        images = sorted(
            entry for entry in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, entry))
        )
        rng.shuffle(images)

        val_count = int(len(images) * val_ratio)
        test_count = int(len(images) * test_ratio)

        # After shuffling: the first val_count files go to validation, the
        # next test_count to testing, and everything left to training.
        for index, image in enumerate(images):
            image_path = os.path.join(class_path, image)
            if index < val_count:
                destination = val_class_path
            elif index < val_count + test_count:
                destination = test_class_path
            else:
                destination = train_class_path
            shutil.copy(image_path, destination)

In [7]:
# Source dataset (one sub-folder per class) and destination roots for each split.
dataset_path = 'part-1-1'
train_path = 'split/train'
val_path = 'split/val'
test_path = 'split/test'

# Fractions of each class sent to validation and testing; the rest is training.
val_ratio = 0.1
test_ratio = 0.2

# split_dataset shuffles with the global `random` module, so seed it here to
# make the split repeatable under Restart & Run All.
# NOTE(review): repeatability also assumes the directory listing order is
# stable between runs on this filesystem.
seed = 42
random.seed(seed)

split_dataset(dataset_path, train_path, val_path, test_path, val_ratio, test_ratio)

print('Dataset split into training, validation and testing datasets.\n')
print('Training dataset:\t', train_path)
print('Validation dataset:\t', val_path)
print('Testing dataset:\t', test_path)

Dataset split into training, validation and testing datasets
Training dataset:	 split/train
Validation dataset:	 split/val
Testing dataset:	 split/test


In [8]:
def _count_files(root):
    """Return the number of files found anywhere beneath ``root``."""
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(root):
        total += len(filenames)
    return total


# Tally the files under each split directory, recursing into the class folders.
train_count = _count_files(train_path)
val_count = _count_files(val_path)
test_count = _count_files(test_path)

print('\nNumber of images in training dataset:\t', train_count)
print('Number of images in validation dataset:\t', val_count)
print('Number of images in testing dataset:\t', test_count)


Number of images in training dataset:	 6514
Number of images in validation dataset:	 923
Number of images in testing dataset:	 1852
