# In [6]:
import os
import shutil
import random

def split_yolo_dataset(images_path, labels_path, output_path, train_ratio=0.8, seed=42):
    """
    Split a dataset of images and YOLO-format labels into training and validation sets.

    Only ``.png`` images that have a ``.txt`` label with the same base filename
    are used; images without a label and labels without an image are ignored.
    The selected files are copied (the originals are left in place) into
    ``<output_path>/train/images``, ``<output_path>/train/labels``,
    ``<output_path>/val/images`` and ``<output_path>/val/labels``. A summary
    with the size of each subset is printed when the copy is finished.

    Args:
        images_path (str): Path to the folder containing image files.
        labels_path (str): Path to the folder containing label files.
        output_path (str): Path to the output folder where `train` and `val` subfolders will be created.
        train_ratio (float): Proportion of data to use for training, in [0, 1]. The rest goes to validation.
        seed (int): Random seed for reproducibility. Note that this seeds the
            module-level `random` generator.

    Raises:
        ValueError: If `train_ratio` is not within [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Ensure reproducibility
    random.seed(seed)

    # Index label files by base filename so every image is matched with its
    # own label. Sorting both lists and zipping them is not safe: the sort
    # order depends on the extension (e.g. "a.png" < "a.q.png" but
    # "a.q.txt" < "a.txt"), which can pair an image with the wrong label.
    labels_by_basename = {
        os.path.splitext(f)[0]: f
        for f in os.listdir(labels_path)
        if f.endswith(".txt")
    }

    # Walk the images in sorted order so the shuffle below is deterministic
    # regardless of the order in which the OS lists the directory.
    data_pairs = []
    for img_file in sorted(f for f in os.listdir(images_path) if f.endswith(".png")):
        lbl_file = labels_by_basename.get(os.path.splitext(img_file)[0])
        if lbl_file is not None:
            data_pairs.append((img_file, lbl_file))

    # Shuffle data
    random.shuffle(data_pairs)

    # Split data into training and validation sets
    split_idx = int(len(data_pairs) * train_ratio)
    train_pairs = data_pairs[:split_idx]
    val_pairs = data_pairs[split_idx:]

    # Create output directories
    train_images_dir = os.path.join(output_path, "train", "images")
    train_labels_dir = os.path.join(output_path, "train", "labels")
    val_images_dir = os.path.join(output_path, "val", "images")
    val_labels_dir = os.path.join(output_path, "val", "labels")

    for dir_path in [train_images_dir, train_labels_dir, val_images_dir, val_labels_dir]:
        os.makedirs(dir_path, exist_ok=True)

    # Function to copy files to respective directories
    def copy_files(pairs, images_dir, labels_dir):
        for img_file, lbl_file in pairs:
            shutil.copy2(os.path.join(images_path, img_file), images_dir)
            shutil.copy2(os.path.join(labels_path, lbl_file), labels_dir)

    # Copy training and validation files
    copy_files(train_pairs, train_images_dir, train_labels_dir)
    copy_files(val_pairs, val_images_dir, val_labels_dir)

    print(f"Dataset split completed.\nTrain images and labels: {len(train_pairs)}\nValidation images and labels: {len(val_pairs)}")

# Example usage: split the full pineapple video dataset 80/20 into the
# `train` and `val` subfolders created under the dataset root.
output_path = "H:/Datasets/VideoPineapple/DJI_0007_V_1_and_0004_V_3/"
labels_path = "H:/Datasets/VideoPineapple/DJI_0007_V_1_and_0004_V_3/full/labels"
images_path = "H:/Datasets/VideoPineapple/DJI_0007_V_1_and_0004_V_3/full/images"
split_yolo_dataset(
    images_path=images_path,
    labels_path=labels_path,
    output_path=output_path,
    train_ratio=0.8,
    seed=42,
)

# Output:
# Dataset split completed.
# Train images and labels: 692
# Validation images and labels: 174
