In [None]:
import os
import re
import shutil

def clean_species_folders(folder_paths, species_list, dry_run=False):
    """Delete every file whose name does not start with a known species prefix.

    Each species name is turned into a prefix by lower-casing it and replacing
    spaces with underscores ("Alectis ciliaris" -> "alectis_ciliaris"). A file
    is kept when its name, compared case-insensitively, is such a prefix
    followed by an underscore and at least one more character. Entries that
    are directories are skipped; the function does not descend into them.

    WARNING: deletion is permanent. With an empty ``species_list`` no file can
    match, so every regular file in the given folders is removed. Use
    ``dry_run=True`` to preview the result first.

    Args:
        folder_paths: Iterable of directory paths to clean.
        species_list: Iterable of species names with space-separated words.
        dry_run: When True, nothing is deleted; the returned list contains the
            files that *would* be removed. Defaults to False.

    Returns:
        list[str]: Paths of the removed files (or, with ``dry_run``, of the
        files that would be removed), in the order they were visited.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.remove``, e.g. when a
            folder does not exist or a file cannot be deleted.
    """
    # Convert species names ("Alectis ciliaris") -> ["alectis_ciliaris", ...]
    valid_prefixes = [s.lower().replace(" ", "_") for s in species_list]

    # Pre-compile one pattern per species: <prefix>_<anything>.
    # re.escape makes regex metacharacters in a name (such as the dot in
    # "sp.") match literally instead of acting as wildcards.
    patterns = [
        re.compile(f"^{re.escape(prefix)}_.+", re.IGNORECASE)
        for prefix in valid_prefixes
    ]

    removed = []

    for folder in folder_paths:
        for filename in os.listdir(folder):
            file_path = os.path.join(folder, filename)

            # Skip directories
            if os.path.isdir(file_path):
                continue

            # Keep files that belong to one of the requested species
            if any(p.match(filename) for p in patterns):
                continue

            # Everything else is deleted (or only reported in dry-run mode)
            if not dry_run:
                os.remove(file_path)
            removed.append(file_path)

    return removed

In [None]:
# Species whose image/label files must be kept in the YOLO dataset.
species = [
    "Alectis ciliaris",
    "Aphareus rutilans",
    "Caranx ignobilis",
    "Caranx lugubris",
    "Caranx melampygus",
    "Caranx sexfasciatus",
    "Chirocentrus dorab",
    "Chirocentrus nudus",
    "Decapterus macrosoma",
    "Elagatis bipinnulata",
    "Epinephelus maculatus",
    "Epinephelus radiatus",
    "Etelis carbunculus",
    "Gymnocranius grandoculis",
    "Katsuwonus pelamis",
    "Lethrinus atkinsoni",
    "Lethrinus erythracanthus",
    "Lethrinus obsoletus",
    "Lethrinus ornatus",
    "Lutjanus bohar",
    "Lutjanus fulviflamma",
    "Lutjanus fulvus",
    "Lutjanus gibbus",
    "Lutjanus johnii",
    "Lutjanus kasmira",
    "Lutjanus rivulatus",
    "Lutjanus russellii",
    "Lutjanus timoriensis",
    "Monotaxis grandoculis",
    "Psettodes erumei",
    "Rastrelliger kanagurta",
    "Sardinella albella",
    "Scolopsis lineata",
    "Scolopsis vosmeri",
    "Scomberoides lysan",
    "Scomberomorus commerson",
    "Seriola dumerili",
    "Variola albimarginata",
]

# Image and label directories of every dataset split (paths are relative to
# the notebook's working directory).
folder = [
    "../yolo/dataset/test/images/",
    "../yolo/dataset/train/images/",
    "../yolo/dataset/valid/images/",
    "../yolo/dataset/test/labels/",
    "../yolo/dataset/train/labels/",
    "../yolo/dataset/valid/labels/",
]

# Destructive step: permanently deletes every file that does not belong to
# one of the species above.
removed_files = clean_species_folders(folder, species)
print(f"Removed {len(removed_files)} files:")

In [None]:
import os
import re
from collections import defaultdict

def count_species_in_folders(folder_paths, species_list):
    """Count, per species, the files whose name starts with its prefix.

    Each species name is turned into a prefix by lower-casing it and replacing
    spaces with underscores ("Alectis ciliaris" -> "alectis_ciliaris"). A file
    is attributed to a species when its name, compared case-insensitively, is
    that prefix followed by an underscore and at least one more character.
    Every file is counted at most once; when several prefixes match, the
    longest (most specific) one wins. Entries that are directories are
    skipped; the function does not descend into them.

    Args:
        folder_paths: Iterable of directory paths to scan.
        species_list: Iterable of species names with space-separated words.

    Returns:
        dict[str, int]: Maps every name from ``species_list`` (in input order)
        to the number of matching files summed over all folders; species
        without any file map to 0.

    Raises:
        OSError: Propagated from ``os.listdir``, e.g. when a folder does not
            exist.
    """
    # Materialise once so a one-shot iterable can still be zipped below.
    species_list = list(species_list)

    # Convert species names -> snake_case prefixes
    prefixes = [s.lower().replace(" ", "_") for s in species_list]

    # One compiled pattern per distinct prefix. re.escape makes regex
    # metacharacters in a name (such as the dot in "sp.") match literally.
    prefix_patterns = {
        p: re.compile(f"^{re.escape(p)}_.+", re.IGNORECASE) for p in prefixes
    }

    # Try longer prefixes first so that "caranx_ignobilis_1.jpg" is counted
    # for "caranx_ignobilis" rather than for a shorter prefix "caranx".
    # sorted() is stable, so equally long prefixes keep their input order.
    ordered_patterns = sorted(
        prefix_patterns.items(), key=lambda item: len(item[0]), reverse=True
    )

    species_count = defaultdict(int)

    for folder in folder_paths:
        for filename in os.listdir(folder):
            # Skip directories
            if os.path.isdir(os.path.join(folder, filename)):
                continue

            # Attribute the file to the first (most specific) matching prefix
            for prefix, pattern in ordered_patterns:
                if pattern.match(filename):
                    species_count[prefix] += 1
                    break

    # Report under the original species names, including zero counts
    return {
        original_name: species_count.get(prefix, 0)
        for original_name, prefix in zip(species_list, prefixes)
    }

In [None]:
# Tally the files that remain for each species across all dataset folders.
counts = count_species_in_folders(folder, species)

# One "<species>: <count>" line per species, in the order of the input list.
for species_name in counts:
    count = counts[species_name]
    print(f"{species_name}: {count}")

# Number of species covered by the report.
print(len(counts))