In [3]:
import os
import shutil
from sklearn.model_selection import train_test_split

def _is_fraction(value):
    """Return True when value is a float strictly between 0 and 1."""
    return isinstance(value, float) and 0.0 < value < 1.0


def _validation_share_of_remainder(test_size, validation_size):
    """
    Convert validation_size from a share of the whole dataset to a share of the
    files left over once the test split has been removed.

    Values that are not floats in (0, 1) (for example integer counts) are returned
    unchanged so that train_test_split interprets and validates them itself.

    Raises:
    - ValueError: if both sizes are fractions and together leave nothing for training.
    """
    if not (_is_fraction(test_size) and _is_fraction(validation_size)):
        return validation_size
    if test_size + validation_size >= 1.0:
        raise ValueError(
            "test_size + validation_size must be less than 1.0, got "
            f"{test_size} + {validation_size}"
        )
    return validation_size / (1.0 - test_size)


def split_data(input_dir, output_dir, test_size=0.2, validation_size=0.1, random_seed=42):
    """
    Split data in input_dir into training, test, and validation sets and organize in output_dir.

    Every sub-folder of input_dir is treated as one class. Its files are copied (the
    originals are left in place) into output_dir/train/<class>, output_dir/test/<class>
    and output_dir/validation/<class>. The number of files found in each class is printed.

    Parameters:
    - input_dir (str): Path to the input data directory where classes are organized in folders.
    - output_dir (str): Path to the output directory where the split data will be organized.
    - test_size (float): The proportion of the data to include in the test split (default is 0.2).
    - validation_size (float): The proportion of the data to include in the validation split
      (default is 0.1). When both sizes are fractions, this is a proportion of the whole
      class, not of what remains after the test split.
    - random_seed (int): Seed for random number generator for reproducibility (default is 42).

    Raises:
    - ValueError: if test_size and validation_size are fractions that sum to 1.0 or more.
      Classes with too few files for the requested split are presumably rejected by
      train_test_split itself; that error is not intercepted here.
    """
    # Validate before touching the filesystem so bad arguments leave no partial output.
    validation_share = _validation_share_of_remainder(test_size, validation_size)

    os.makedirs(output_dir, exist_ok=True)

    # Get the list of class folders in the input directory
    class_folders = [
        folder for folder in os.listdir(input_dir)
        if os.path.isdir(os.path.join(input_dir, folder))
    ]

    for class_folder in class_folders:
        class_input_dir = os.path.join(input_dir, class_folder)
        split_dirs = {
            split_name: os.path.join(output_dir, split_name, class_folder)
            for split_name in ('train', 'test', 'validation')
        }
        for split_dir in split_dirs.values():
            os.makedirs(split_dir, exist_ok=True)

        # Only regular files can be copied with shutil.copy, so nested directories are
        # ignored. Sorting matters: os.listdir returns entries in arbitrary order, and
        # the seeded shuffle is only reproducible when it starts from a fixed order.
        class_files = sorted(
            entry for entry in os.listdir(class_input_dir)
            if os.path.isfile(os.path.join(class_input_dir, entry))
        )

        # Print the number of items in each class
        print(f"Class '{class_folder}': {len(class_files)} items")

        # Nothing to split; keep the (empty) output folders and move on to the next class.
        if not class_files:
            continue

        # Split off the test set first, then carve the validation set out of the rest.
        remaining_files, test_files = train_test_split(
            class_files, test_size=test_size, random_state=random_seed
        )
        train_files, validation_files = train_test_split(
            remaining_files, test_size=validation_share, random_state=random_seed
        )

        assignments = (
            ('train', train_files),
            ('test', test_files),
            ('validation', validation_files),
        )
        for split_name, file_names in assignments:
            destination_dir = split_dirs[split_name]
            for file_name in file_names:
                shutil.copy(
                    os.path.join(class_input_dir, file_name),
                    os.path.join(destination_dir, file_name),
                )

# Example usage: split the dataset folder into a sibling "Split" folder,
# leaving the split proportions and the seed at their defaults.
input_directory = 'Sugarcane Leaf Image Dataset'
output_directory = 'Sugarcane Leaf Image Dataset Split'
split_data(input_dir=input_directory, output_dir=output_directory)


Class 'Banded Chlorosis': 471 items
Class 'Brown Spot': 1722 items
Class 'BrownRust': 314 items
Class 'Dried Leaves': 343 items
Class 'Grassy shoot': 346 items
Class 'Healthy Leaves': 430 items
Class 'Pokkah Boeng': 297 items
Class 'Sett Rot': 652 items
Class 'smut': 316 items
Class 'Viral Disease': 663 items
Class 'Yellow Leaf': 1194 items
