In [1]:
# Importing the required libraries
import os
import shutil
import random
import re
from collections import defaultdict
from pathlib import Path

# Fix the RNG seed up front so every shuffle performed later in the script
# starts from the same random state on each run.
random.seed(42)

# Names of the output partitions and the share of images given to each one:
# 70% train, 15% validation, 15% test (same order in both lists).
splits = ['train', 'val', 'test']
split_ratio = [0.7, 0.15, 0.15]

# Species names the script looks for as prefixes of image filenames.
species_list = [
    "AmurTiger",
    "AmurLeopard",
    "LeopardCat",
    "RedFox",
    "Weasel",
    "WildBoar",
]

# Source folders per dataset: "images_dir" holds the pictures and
# "labels_dir" holds the matching label files.
datasets = {
    "dataset_day": dict(images_dir="Day/JPEGImages", labels_dir="Day/labels"),
    "dataset_night": dict(images_dir="Night/JPEGImages", labels_dir="Night/labels"),
}

# Function to detect which species a given image belongs to based on its filename
def get_species(filename, candidates=None):
    """Return the species whose name is a prefix of *filename*, or None.

    The comparison is case-insensitive and only looks at the start of the
    name, so "AmurTiger_12.jpg", "amurtiger (2).png" and "AmurTiger-3.jpeg"
    all resolve to "AmurTiger".

    Args:
        filename: Bare image filename (no directory part).
        candidates: Optional iterable of species names to test, in priority
            order. Defaults to the module-level ``species_list``.

    Returns:
        The first candidate (with its original spelling) that prefixes
        *filename*, or None when no candidate matches.
    """
    if candidates is None:
        candidates = species_list

    # Lower-case once instead of once per candidate.
    lowered = filename.lower()

    # A plain prefix test already covers "Species_123.jpg" / "Species (2).png"
    # style names, so no separate delimiter regex is needed (and species names
    # never have to be regex-escaped).
    for species in candidates:
        if lowered.startswith(species.lower()):
            return species

    # No candidate matched.
    return None

# Split every configured dataset (Day and Night) into train/val/test,
# species by species, so each species keeps roughly the same ratio in every
# split. Output goes to <dataset_name>/{images,labels}/{train,val,test},
# relative to the current working directory.
for dataset_name, paths in datasets.items():
    images_dir = paths["images_dir"]
    labels_dir = paths["labels_dir"]
    output_dir = dataset_name  # Parent directory for the organized splits

    print(f"\n➡️ Processing {dataset_name}...")

    # Creating output folders for train, val, and test splits (both for images and labels)
    for split in splits:
        os.makedirs(os.path.join(output_dir, 'images', split), exist_ok=True)
        os.makedirs(os.path.join(output_dir, 'labels', split), exist_ok=True)

    # os.listdir() returns entries in arbitrary order. Sorting the filenames
    # first gives random.shuffle() the same input on every machine, which is
    # what makes the seeded split actually reproducible.
    all_images = sorted(
        f for f in os.listdir(images_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    # Grouping the images by species; files that match no species are counted
    # so they can be reported instead of disappearing silently.
    species_images = defaultdict(list)
    unmatched = []
    for fname in all_images:
        species = get_species(fname)
        if species:
            species_images[species].append(fname)
        else:
            unmatched.append(fname)

    if unmatched:
        print(f"⚠️ {len(unmatched)} image(s) in {images_dir} matched no known species and were skipped")

    # Going through each species and splitting its images into train/val/test sets
    for species, images in species_images.items():
        random.shuffle(images)  # Shuffle to make the splits random
        total = len(images)

        # Calculating how many images go into each split. int() truncates, so
        # any rounding leftovers end up in the test split.
        n_train = int(total * split_ratio[0])
        n_val = int(total * split_ratio[1])
        n_test = total - n_train - n_val  # Remainder goes to test

        # Mapping each split name to its contiguous slice of the shuffled list
        split_map = {
            'train': images[:n_train],
            'val': images[n_train:n_train + n_val],
            'test': images[n_train + n_val:]
        }

        # Copying each image and its label to the respective split folder
        for split, split_imgs in split_map.items():
            for img in split_imgs:
                # Copying image file
                src_img = os.path.join(images_dir, img)
                dst_img = os.path.join(output_dir, 'images', split, img)
                shutil.copyfile(src_img, dst_img)

                # Copying label file (if it exists). The label shares the
                # image's stem with a .txt extension; an image without a
                # label file is still copied, just without a label.
                label_name = Path(img).with_suffix('.txt').name
                src_label = os.path.join(labels_dir, label_name)
                dst_label = os.path.join(output_dir, 'labels', split, label_name)
                if os.path.exists(src_label):
                    shutil.copyfile(src_label, dst_label)

        # Printing a quick summary for this species
        print(f"✔️ {species} split done — Train: {n_train}, Val: {n_val}, Test: {n_test}")

print("\n✅ Splitting completed for the day and night datasets.")



➡️ Processing dataset_day...
✔️ AmurLeopard split done — Train: 489, Val: 105, Test: 106
✔️ AmurTiger split done — Train: 489, Val: 105, Test: 106
✔️ LeopardCat split done — Train: 489, Val: 105, Test: 106
✔️ RedFox split done — Train: 489, Val: 105, Test: 106
✔️ Weasel split done — Train: 489, Val: 105, Test: 106
✔️ WildBoar split done — Train: 489, Val: 105, Test: 106

➡️ Processing dataset_night...
✔️ AmurLeopard split done — Train: 350, Val: 75, Test: 75
✔️ AmurTiger split done — Train: 350, Val: 75, Test: 75
✔️ LeopardCat split done — Train: 350, Val: 75, Test: 75
✔️ RedFox split done — Train: 350, Val: 75, Test: 75
✔️ Weasel split done — Train: 350, Val: 75, Test: 75
✔️ WildBoar split done — Train: 350, Val: 75, Test: 75

✅ Splitting completed for the day and night datasets.
