In [5]:
import os
import shutil
from sklearn.model_selection import train_test_split

input_directory = './processed_data/'
output_train_directory = input_directory + "train/"
output_test_directory = input_directory + "test/"

def splitDatasetOnTrainTest(input_directory, output_train_directory, output_test_directory, test_size=0.2, random_seed=42):
    # Create output directories if they don't exist
    os.makedirs(output_train_directory, exist_ok=True)
    os.makedirs(output_test_directory, exist_ok=True)

    # Iterate through subdirectories (each representing a class)
    for class_name in os.listdir(input_directory):
        class_directory = os.path.join(input_directory, class_name)

        # Check if it's a directory
        if os.path.isdir(class_directory):
            # print(f"Processing class: {class_name}")

            # List all image files in the class directory
            image_files = [file for file in os.listdir(class_directory)]

            # Check if there are at least two samples for the class
            if len(image_files) < 2:
                print(f"   Warning: Insufficient samples for class {class_name}. Skipping.")
                continue

            # Split the images into train and test sets
            train_images, test_images = train_test_split(image_files, test_size=test_size, random_state=random_seed)

            # Copy images to the respective output directories
            for image in train_images:
                source_path = os.path.join(class_directory, image)
                target_path = os.path.join(output_train_directory, class_name, image)
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                shutil.move(source_path, target_path)

            for image in test_images:
                source_path = os.path.join(class_directory, image)
                target_path = os.path.join(output_test_directory, class_name, image)
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                shutil.move(source_path, target_path)

            print(f"   {len(train_images)} images for training, {len(test_images)} images for testing")

   

In [6]:
splitDatasetOnTrainTest(input_directory, output_train_directory, output_test_directory)

   1 images for training, 1 images for testing
   77 images for training, 20 images for testing
   2 images for training, 1 images for testing
   124 images for training, 31 images for testing
