# In [None]:
import math
import os
import shutil

from sklearn.model_selection import train_test_split

# Source dataset locations and the root folder of the generated layout
images_dir = "Pavement-datasets/rdd2022/India/img"  # Folder containing images
annotations_dir = "Pavement-datasets/rdd2022/India/labels"  # Folder containing .jpg.txt annotations
output_dir = "datasets"

# Requested share of the dataset for each split
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Reject ratios that cannot describe a split before touching the filesystem
if min(train_ratio, val_ratio, test_ratio) < 0:
    raise ValueError("Split ratios must be non-negative.")
total_ratio = train_ratio + val_ratio + test_ratio
if total_ratio <= 0:
    raise ValueError("Split ratios must sum to a positive value.")

# Normalize ratios if necessary. The comparison uses a tolerance because
# 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999 in binary floating point;
# an exact `!= 1.0` test would rescale (and report) ratios that are already
# correct.
if not math.isclose(total_ratio, 1.0):
    train_ratio = train_ratio / total_ratio
    val_ratio = val_ratio / total_ratio
    test_ratio = test_ratio / total_ratio
    print(f"Adjusted Ratios -> Train: {train_ratio:.2f}, Val: {val_ratio:.2f}, Test: {test_ratio:.2f}")

# Create necessary output directories (no error if they already exist)
for subdir in ['images/train', 'images/val', 'images/test', 'labels/train', 'labels/val', 'labels/test']:
    os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)

# Get all image files. os.listdir() returns entries in arbitrary,
# filesystem-dependent order, so sort them: otherwise the seeded
# train/val/test split would differ from one machine to the next.
image_files = sorted(f for f in os.listdir(images_dir) if f.endswith(('.jpg', '.png')))

# Match .jpg.txt annotations with images; images without an annotation
# file are left out of the dataset.
valid_image_files = []
valid_annotation_files = []

for image_file in image_files:
    annotation_file = f"{image_file}.txt"  # Append `.txt` to the image name
    annotation_path = os.path.join(annotations_dir, annotation_file)
    if os.path.exists(annotation_path):
        valid_image_files.append(image_file)
        valid_annotation_files.append(annotation_file)

# Stop early when nothing can be split
if not valid_image_files:
    raise ValueError("No valid image-annotation pairs found. Check your dataset.")

# Split into training, validation, and test sets in two stages.
# Stage 1: separate the training pairs from everything that is held out.
holdout_fraction = val_ratio + test_ratio
train_images, remaining_images, train_annotations, remaining_annotations = train_test_split(
    valid_image_files,
    valid_annotation_files,
    test_size=holdout_fraction,
    random_state=42,
)

# Stage 2: divide the held-out pairs between validation and test, using the
# test share expressed relative to the held-out portion.
test_share_of_holdout = test_ratio / holdout_fraction
val_images, test_images, val_annotations, test_annotations = train_test_split(
    remaining_images,
    remaining_annotations,
    test_size=test_share_of_holdout,
    random_state=42,
)

# Function to copy image/annotation pairs into their destination directories
def move_files(image_files, annotation_files, image_dest, label_dest,
               src_images_dir=None, src_annotations_dir=None):
    """Copy images and their annotations into the destination folders.

    Despite the name, files are copied with ``shutil.copy`` rather than
    moved, so the source folders are left untouched.

    Args:
        image_files: Image file names, relative to the source image folder.
        annotation_files: Annotation file names, relative to the source
            annotation folder, paired positionally with ``image_files``.
        image_dest: Existing directory that receives the images under
            their original names.
        label_dest: Existing directory that receives the annotations,
            renamed to ``<image name without extension>.txt``.
        src_images_dir: Folder to read images from. Defaults to the
            module-level ``images_dir``.
        src_annotations_dir: Folder to read annotations from. Defaults to
            the module-level ``annotations_dir``.
    """
    if src_images_dir is None:
        src_images_dir = images_dir
    if src_annotations_dir is None:
        src_annotations_dir = annotations_dir

    for image_file, annotation_file in zip(image_files, annotation_files):
        # Copy the image under its original name
        shutil.copy(os.path.join(src_images_dir, image_file), os.path.join(image_dest, image_file))
        # Swap only the final extension for `.txt`. A plain str.replace would
        # also rewrite any '.jpg' / '.png' occurring earlier in the name.
        new_annotation_name = os.path.splitext(image_file)[0] + '.txt'
        shutil.copy(
            os.path.join(src_annotations_dir, annotation_file),
            os.path.join(label_dest, new_annotation_name),
        )

# Copy every split (training, validation, test - in that order) into its
# images/<split> and labels/<split> folders under the output directory.
split_contents = (
    ('train', train_images, train_annotations),
    ('val', val_images, val_annotations),
    ('test', test_images, test_annotations),
)
for split_name, split_images, split_annotations in split_contents:
    move_files(
        split_images,
        split_annotations,
        os.path.join(output_dir, f'images/{split_name}'),
        os.path.join(output_dir, f'labels/{split_name}'),
    )

print("Dataset successfully organized in YOLO format!")
