# In [6]:
import os
import shutil
from glob import glob
import random

# Set a seed for reproducibility
random.seed(42)

def _list_files(directory):
    """Return the regular files directly inside *directory*, sorted by path.

    ``glob`` yields entries in an arbitrary, filesystem-dependent order, so the
    result is sorted to make a seeded shuffle reproducible. Directories are
    dropped because ``shutil.copy`` cannot copy them and they would otherwise
    distort the split sizes.
    """
    return sorted(
        path for path in glob(os.path.join(directory, '*')) if os.path.isfile(path)
    )


def _list_subdirs(directory):
    """Return the immediate subdirectories of *directory*, sorted by path."""
    return sorted(
        path for path in glob(os.path.join(directory, '*')) if os.path.isdir(path)
    )


def _copy_files(files, dest):
    """Copy every path in *files* into *dest*, reporting failures without aborting."""
    for file in files:
        try:
            shutil.copy(file, dest)
        except PermissionError as e:
            print(f"PermissionError: {e}")
        except OSError as e:
            # Best effort: one unreadable file must not stop the whole split.
            print(f"Error copying file {file} to {dest}: {e}")


def _count_files(directory):
    """Return the number of files found anywhere below *directory*."""
    return sum(len(files) for _, _, files in os.walk(directory))


def split_dataset(base_path):
    """Copy a dataset into sibling ``dl_data/train``, ``valid`` and ``test`` folders.

    The output folder ``dl_data`` is created next to *base_path* (in its parent
    directory). Two input layouts are handled:

    * *base_path* contains both ``train`` and ``test`` folders: every class
      subfolder of ``train`` is shuffled and split 75% / 25% into the new
      ``train`` and ``valid`` folders, and ``test`` is copied unchanged. The
      class subfolder structure is preserved.
    * otherwise: the files directly inside *base_path* are shuffled and split
      60% / 20% / remainder into ``train``, ``valid`` and ``test``.

    Shuffling uses the module-level ``random`` state, so call ``random.seed``
    beforehand for a reproducible split. Files are copied, never moved, and
    copy failures are printed rather than raised. A summary of the resulting
    file counts is printed at the end.

    Args:
        base_path: Path of the folder holding the source dataset.

    Returns:
        None.
    """
    # Strip any trailing separator first: os.path.dirname('a/b/') is 'a/b',
    # which would place dl_data *inside* the source folder instead of beside it.
    base_path = os.path.normpath(base_path)
    parent_path = os.path.dirname(base_path)
    new_base_path = os.path.join(parent_path, 'dl_data')

    # Directories for new train, validation, and test
    new_train_dir = os.path.join(new_base_path, 'train')
    new_valid_dir = os.path.join(new_base_path, 'valid')
    new_test_dir = os.path.join(new_base_path, 'test')

    # Ensure the new directories exist
    for new_dir in (new_train_dir, new_valid_dir, new_test_dir):
        os.makedirs(new_dir, exist_ok=True)

    # Split ratios relative to the total data; test receives the remainder.
    train_split = 0.6
    valid_split = 0.2
    # Share of an existing train folder that stays train. This equals
    # train_split / (train_split + valid_split), kept as a literal so float
    # rounding in the division cannot shift the cut index.
    train_share_of_original = 0.75

    original_train_dir = os.path.join(base_path, 'train')
    original_test_dir = os.path.join(base_path, 'test')

    if os.path.exists(original_train_dir) and os.path.exists(original_test_dir):
        for subdir in _list_subdirs(original_train_dir):
            # Class folder name (e.g., '0', '1', ..., '9')
            subdir_name = os.path.basename(subdir)

            new_train_subdir = os.path.join(new_train_dir, subdir_name)
            new_valid_subdir = os.path.join(new_valid_dir, subdir_name)
            os.makedirs(new_train_subdir, exist_ok=True)
            os.makedirs(new_valid_subdir, exist_ok=True)

            files = _list_files(subdir)
            random.shuffle(files)

            train_end = int(len(files) * train_share_of_original)
            _copy_files(files[:train_end], new_train_subdir)
            _copy_files(files[train_end:], new_valid_subdir)

        # The original test folder is copied as-is, class by class.
        for subdir in _list_subdirs(original_test_dir):
            new_test_subdir = os.path.join(new_test_dir, os.path.basename(subdir))
            os.makedirs(new_test_subdir, exist_ok=True)
            _copy_files(_list_files(subdir), new_test_subdir)

        print("Files have been split and copied to new train, valid, and test directories while maintaining folder structure.")
    else:
        files = _list_files(base_path)
        random.shuffle(files)

        train_end = int(len(files) * train_split)
        valid_end = train_end + int(len(files) * valid_split)

        _copy_files(files[:train_end], new_train_dir)
        _copy_files(files[train_end:valid_end], new_valid_dir)
        _copy_files(files[valid_end:], new_test_dir)

        print("Files have been split and copied to new train, valid, and test directories with a 6:2:2 ratio.")

    # Count and print the number of files in each new directory
    train_file_count = _count_files(new_train_dir)
    valid_file_count = _count_files(new_valid_dir)
    test_file_count = _count_files(new_test_dir)

    print(f"Number of files in new train directory: {train_file_count}")
    print(f"Number of files in new valid directory: {valid_file_count}")
    print(f"Number of files in new test directory: {test_file_count}")

# Example usage. The guard keeps the copy from running against this
# machine-specific path when the module is imported rather than executed.
if __name__ == "__main__":
    base_path = r'C:\Users\lewka\Downloads\cifar-10-python\cifar-10-batches-py\images'
    split_dataset(base_path)

# Output:
# Files have been split and copied to new train, valid, and test directories while maintaining folder structure.
# Number of files in new train directory: 37500
# Number of files in new valid directory: 12500
# Number of files in new test directory: 10000
