In [1]:
import os
import sys
from pathlib import Path

project_root = Path.cwd().parent 
sys.path.append(str(project_root))
print(f"Project root: {project_root}")

target_dir = project_root / 'data' 


"""if not target_dir.exists():
    print(f"Error: The directory {target_dir} does not exist.")
else:
    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(target_dir):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                try:
                    os.remove(file_path)
                    print(f"Deleted: {file_path}")
                except Exception as e:
                    print(f"Error deleting {file_path}: {e}")"""

Project root: /home/litwin/tiger-fox-elephant


'if not target_dir.exists():\n    print(f"Error: The directory {target_dir} does not exist.")\nelse:\n    # Walk through all directories and subdirectories\n    for root, dirs, files in os.walk(target_dir):\n        for file in files:\n            if file.endswith(\'.txt\'):\n                file_path = os.path.join(root, file)\n                try:\n                    os.remove(file_path)\n                    print(f"Deleted: {file_path}")\n                except Exception as e:\n                    print(f"Error deleting {file_path}: {e}")'

In [2]:
new_data_dir = project_root / 'data' / 'resized_data' 
if not new_data_dir.exists():
    print(f"Error: {new_data_dir} does not exist.")
else:
    animal_counts = {}

    for animal_dir in new_data_dir.iterdir():
        if animal_dir.is_dir():
            num_files = len([f for f in animal_dir.iterdir() if f.is_file()])
            animal_counts[animal_dir.name] = num_files

    print("Number of files per animal folder:")
    for animal, count in sorted(animal_counts.items()):
        print(f"- {animal}: {count} files")

    total_files = sum(animal_counts.values())
    print(f"\nTotal files across all folders: {total_files}")

Number of files per animal folder:
- badger: 1556 files
- bird: 1528 files
- buffalo: 376 files
- butterfly: 2112 files
- cat: 2852 files
- cow: 1866 files
- dog: 4967 files
- elephant: 12037 files
- fox: 6499 files
- hen: 3098 files
- horse: 2623 files
- mouse: 570 files
- rabbit: 938 files
- rhino: 376 files
- sheep: 1820 files
- spider: 4821 files
- squirrel: 1862 files
- tiger: 6976 files
- zebra: 376 files

Total files across all folders: 57253


In [3]:
from tqdm import tqdm

# ----------------------------- Configuration -----------------------------

# Path to the input directory containing class subdirectories
input_dir  = project_root / 'data' / 'resized_data'
# Supported image file extensions
supported_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']

# ------------------------------ Renaming Function -------------------------

def rename_images_with_class_names(input_dir):
    """
    Renames images by prefixing them with their class names in the format:
    className_XX.ext (e.g., class1_01.jpg)

    Args:
        input_dir (Path): Path to the input directory containing class subdirectories.
    """
    # Check if input directory exists
    if not input_dir.exists() or not input_dir.is_dir():
        print(f"Error: Input directory '{input_dir}' does not exist or is not a directory.")
        return

    # Iterate over each class directory
    class_dirs = [d for d in input_dir.iterdir() if d.is_dir()]
    if not class_dirs:
        print(f"No class subdirectories found in '{input_dir}'.")
        return

    for class_dir in class_dirs:
        class_name = class_dir.name
        print(f"\nProcessing class: '{class_name}'")

        # Gather all supported image files in the class directory
        image_files = [f for f in class_dir.iterdir()
                       if f.is_file() and f.suffix.lower() in supported_extensions]

        if not image_files:
            print(f"No supported image files found in '{class_dir}'. Skipping this class.")
            continue

        # Sort the image files to ensure consistent ordering
        image_files.sort()

        # Iterate and rename with a progress bar
        for idx, image_file in enumerate(tqdm(image_files, desc=f"Renaming in {class_name}", unit="image"), start=1):
            # Create new filename with class prefix and two-digit numbering
            new_filename = f"{class_name}_{idx:02d}{image_file.suffix.lower()}"
            new_file_path = class_dir / new_filename

            # Check if the new filename already exists to avoid overwriting
            if new_file_path.exists():
                print(f"Skipped: '{new_filename}' already exists in '{class_dir}'.")
                continue

            # Rename the file
            try:
                image_file.rename(new_file_path)
                # Optionally, print the renaming action
                # print(f"Renamed: '{image_file.name}' to '{new_filename}'")
            except Exception as e:
                print(f"Error renaming '{image_file.name}': {e}")

    print("\nRenaming process completed.")

# ------------------------------- Execution -------------------------------

# Call the renaming function
rename_images_with_class_names(input_dir)



Processing class: 'zebra'


Renaming in zebra: 100%|██████████| 376/376 [00:00<00:00, 79384.79image/s]


Skipped: 'zebra_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/zebra'.
Skipped: 'zebra_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_dat

Renaming in cat: 100%|██████████| 2852/2852 [00:00<00:00, 82478.85image/s]


Skipped: 'cat_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_1084.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cat'.
Skipped: 'cat_1085.jpg' alre

Renaming in buffalo: 100%|██████████| 376/376 [00:00<00:00, 82920.15image/s]


Skipped: 'buffalo_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/buffalo'.
Skipped: 'buffalo_10.jpg' already exists in '/home/litw

Renaming in rabbit: 100%|██████████| 938/938 [00:00<00:00, 87503.77image/s]


Skipped: 'rabbit_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rabbit'.
Skipped: 'rabbit_10.jpg' already exists in '/home/litwin/tiger-fox-elepha

Renaming in sheep: 100%|██████████| 1820/1820 [00:00<00:00, 109565.30image/s]

Skipped: 'sheep_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/sheep'.
Skipped: 'sheep_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_dat


Renaming in elephant:   0%|          | 0/12037 [00:00<?, ?image/s]

Skipped: 'elephant_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_10.jpg' already ex

Renaming in elephant: 100%|██████████| 12037/12037 [00:00<00:00, 85859.05image/s]


Skipped: 'elephant_4972.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4973.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4974.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4975.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4976.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4977.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4978.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4979.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_4980.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/elephant'.
Skipped: 'elephant_

Renaming in mouse: 100%|██████████| 570/570 [00:00<00:00, 101080.39image/s]


Skipped: 'mouse_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/mouse'.
Skipped: 'mouse_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_dat

Renaming in cow:   0%|          | 0/1866 [00:00<?, ?image/s]

Skipped: 'cow_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_11.jpg' already 

Renaming in cow: 100%|██████████| 1866/1866 [00:00<00:00, 41398.38image/s]


Skipped: 'cow_1179.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1180.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1181.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1182.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1183.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1184.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1185.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1186.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1187.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: 'cow_1188.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/cow'.
Skipped: '

Renaming in horse:   0%|          | 0/2623 [00:00<?, ?image/s]

Skipped: 'horse_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_dat

Renaming in horse: 100%|██████████| 2623/2623 [00:00<00:00, 46965.06image/s]


Skipped: 'horse_2075.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2076.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2077.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2078.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2079.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2080.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2081.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2082.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2083.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/horse'.
Skipped: 'horse_2084.jpg' already exists in '/home/litwin/tiger-fox-eleph

Renaming in spider:   0%|          | 0/4821 [00:00<?, ?image/s]

Skipped: 'spider_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/spider'.
Skipped: 'spider_10.jpg' already exists in '/home/litwin/tiger-fox-elepha

Renaming in spider: 100%|██████████| 4821/4821 [00:00<00:00, 97224.44image/s]



Processing class: 'rhino'


Renaming in rhino: 100%|██████████| 376/376 [00:00<00:00, 95974.82image/s]


Skipped: 'rhino_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/rhino'.
Skipped: 'rhino_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_dat

Renaming in squirrel:   0%|          | 0/1862 [00:00<?, ?image/s]

Skipped: 'squirrel_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_10.jpg' already ex

Renaming in squirrel: 100%|██████████| 1862/1862 [00:00<00:00, 95928.10image/s]

Skipped: 'squirrel_1786.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1787.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1788.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1789.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1790.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1791.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1792.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1793.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_1794.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/squirrel'.
Skipped: 'squirrel_


Renaming in fox:   0%|          | 0/6499 [00:00<?, ?image/s]

Skipped: 'fox_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_11.jpg' already 

Renaming in fox: 100%|██████████| 6499/6499 [00:00<00:00, 98495.69image/s]

Skipped: 'fox_6118.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6119.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6120.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6121.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6122.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6123.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6124.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6125.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6126.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: 'fox_6127.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/fox'.
Skipped: '


Renaming in tiger:   0%|          | 0/6976 [00:00<?, ?image/s]

Skipped: 'tiger_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_dat

Renaming in tiger: 100%|██████████| 6976/6976 [00:00<00:00, 105058.27image/s]


Skipped: 'tiger_6319.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6320.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6321.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6322.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6323.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6324.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6325.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6326.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6327.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/tiger'.
Skipped: 'tiger_6328.jpg' already exists in '/home/litwin/tiger-fox-eleph

Renaming in dog:   0%|          | 0/4967 [00:00<?, ?image/s]

Skipped: 'dog_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_11.jpg' already 

Renaming in dog: 100%|██████████| 4967/4967 [00:00<00:00, 102735.46image/s]


Skipped: 'dog_866.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_867.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_868.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_869.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_870.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_871.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_872.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_873.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_874.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_875.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/dog'.
Skipped: 'dog_876.jp

Renaming in hen:   0%|          | 0/3098 [00:00<?, ?image/s]

Skipped: 'hen_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_11.jpg' already 

Renaming in hen: 100%|██████████| 3098/3098 [00:00<00:00, 75555.03image/s]


Skipped: 'hen_1892.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1893.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1894.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1895.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1896.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1897.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1898.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1899.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1900.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: 'hen_1901.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/hen'.
Skipped: '

Renaming in butterfly: 100%|██████████| 2112/2112 [00:00<00:00, 105996.87image/s]

Skipped: 'butterfly_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/butterfly'.
Skipped: 'butterfly


Renaming in bird: 100%|██████████| 1528/1528 [00:00<00:00, 89795.67image/s]


Skipped: 'bird_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: 'bird_10.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/bird'.
Skipped: '

Renaming in badger:   0%|          | 0/1556 [00:00<?, ?image/s]

Skipped: 'badger_01.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_02.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_03.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_04.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_05.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_06.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_07.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_08.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_09.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_10.jpg' already exists in '/home/litwin/tiger-fox-elepha

Renaming in badger: 100%|██████████| 1556/1556 [00:00<00:00, 89213.67image/s]

Skipped: 'badger_543.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_544.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_545.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_546.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_547.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_548.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_549.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_550.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_551.jpg' already exists in '/home/litwin/tiger-fox-elephant/data/resized_data/badger'.
Skipped: 'badger_552.jpg' already exists in '/home/litwin/tiger-




In [4]:
from tqdm import tqdm
import shutil
from sklearn.model_selection import train_test_split

target_classes = ['fox', 'elephant', 'tiger']

input_dir  = project_root / 'data' / 'resized_data'
output_dir = project_root / 'data' / 'data/resized_and_split'

# Define split ratios
train_ratio = 0.7
validation_ratio = 0.2
test_ratio = 0.1


supported_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']

# ----------------------------- Helper Functions -------------------------

def get_image_files(directory):
    """Retrieve all image files in a directory with supported extensions."""
    return [f for f in directory.iterdir() if f.is_file() and f.suffix.lower() in supported_extensions]

def create_directory(path):
    """Create a directory if it doesn't exist."""
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)

def copy_files(file_list, destination):
    """Copy a list of files to the destination directory."""
    for file_path in file_list:
        shutil.copy(file_path, destination)

# ------------------------------- Splitting Function ----------------------

def split_data_for_class(target_class, all_classes, input_dir, output_dir,
                        train_ratio, validation_ratio, test_ratio):
    """
    Split data for a specific class into train, validation, and test sets.
    
    Args:
        target_class (str): The class for which to create the binary classifier.
        all_classes (list): List of all classes.
        input_dir (Path): Path to the input data directory.
        output_dir (Path): Path to the output split directory.
        train_ratio (float): Proportion of data to use for training.
        validation_ratio (float): Proportion of data to use for validation.
        test_ratio (float): Proportion of data to use for testing.
    """
    # Progress bar for each class
    logger = tqdm(total=3, desc=f"Processing class: {target_class}", leave=False)
    
    # Paths
    target_class_dir = input_dir / target_class
    positive_files = get_image_files(target_class_dir)
    
    # Negative files: all files not in the target class
    negative_files = []
    for cls in all_classes:
        if cls != target_class:
            cls_dir = input_dir / cls
            negative_files.extend(get_image_files(cls_dir))
    
    logger.update(1)
    
    # Labels: 1 for positive, 0 for negative
    # Combine and shuffle
    combined_files = positive_files + negative_files
    labels = [1] * len(positive_files) + [0] * len(negative_files)
    
    # Split into train + temp
    X_train, X_temp, y_train, y_temp = train_test_split(
        combined_files, labels, stratify=labels, test_size=(1 - train_ratio), random_state=42
    )
    
    # Further split temp into validation and test
    relative_val = validation_ratio / (validation_ratio + test_ratio)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, stratify=y_temp, test_size=(test_ratio / (validation_ratio + test_ratio)), random_state=42
    )
    
    logger.update(1)
    
    # Define output paths
    splits = {
        'train': {'positive': [], 'negative': []},
        'validation': {'positive': [], 'negative': []},
        'test': {'positive': [], 'negative': []}
    }
    
    for file, label in zip(X_train, y_train):
        if label == 1:
            splits['train']['positive'].append(file)
        else:
            splits['train']['negative'].append(file)
    
    for file, label in zip(X_val, y_val):
        if label == 1:
            splits['validation']['positive'].append(file)
        else:
            splits['validation']['negative'].append(file)
    
    for file, label in zip(X_test, y_test):
        if label == 1:
            splits['test']['positive'].append(file)
        else:
            splits['test']['negative'].append(file)
    
    logger.update(1)
    
    # Copy files to respective directories
    for split_name, categories in splits.items():
        for category, files in categories.items():
            dest_dir = output_dir / target_class / split_name / category
            create_directory(dest_dir)
            copy_files(files, dest_dir)
    
    logger.update(1)
    logger.close()

# ------------------------------- Execution -------------------------------

# Get all class directories
all_classes = [d.name for d in input_dir.iterdir() if d.is_dir()]

for cls in target_classes:
    split_data_for_class(cls, all_classes, input_dir, output_dir,
                        train_ratio, validation_ratio, test_ratio)

print("Train/Test/Validation splits created successfully for all classes.")


                                                                         

Train/Test/Validation splits created successfully for all classes.




In [5]:
import pandas as pd

# ----------------------------- Configuration -----------------------------

# Define the target classes for binary classifiers
target_classes = ['fox', 'elephant', 'tiger']

# Path to the split data directory
split_dir = project_root / 'data' / 'resized_and_split'

# Supported image file extensions
supported_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']

# Define the splits and categories
splits = ['train', 'validation', 'test']
categories = ['positive', 'negative']

# ------------------------------ Counting Function ------------------------

def count_files(split_dir, target_classes, splits, categories, supported_extensions):
    """
    Counts the number of image files in each split and category for each target class.

    Args:
        split_dir (Path): Path to the split data directory.
        target_classes (list): List of target class names.
        splits (list): List of split names (e.g., 'train', 'validation', 'test').
        categories (list): List of category names (e.g., 'positive', 'negative').
        supported_extensions (list): List of supported image file extensions.

    Returns:
        pd.DataFrame: DataFrame containing counts for each class, split, and category.
    """
    # Initialize a list to store results
    results = []

    # Iterate over each target class
    for cls in tqdm(target_classes, desc="Processing classes", unit="class"):
        cls_dir = split_dir / cls
        if not cls_dir.exists():
            print(f"Warning: Directory '{cls_dir}' does not exist. Skipping class '{cls}'.")
            continue

        # Iterate over each split
        for split in splits:
            split_path = cls_dir / split
            if not split_path.exists():
                print(f"Warning: Split directory '{split_path}' does not exist for class '{cls}'. Skipping split '{split}'.")
                continue

            # Iterate over each category
            for category in categories:
                category_path = split_path / category
                if not category_path.exists():
                    print(f"Warning: Category directory '{category_path}' does not exist for class '{cls}', split '{split}'. Skipping category '{category}'.")
                    continue

                # Count the number of image files with supported extensions
                file_count = sum(1 for f in category_path.iterdir()
                                 if f.is_file() and f.suffix.lower() in supported_extensions)

                # Append the result
                results.append({
                    'Class': cls,
                    'Split': split,
                    'Category': category,
                    'Number of Files': file_count
                })

    # Create a DataFrame from the results
    df = pd.DataFrame(results)

    # Pivot the DataFrame for better readability
    df_pivot = df.pivot_table(index=['Class', 'Split'],
                              columns='Category',
                              values='Number of Files',
                              fill_value=0).reset_index()

    # Optional: Reorder columns
    df_pivot = df_pivot[['Class', 'Split', 'positive', 'negative']]

    return df_pivot

# ------------------------------- Execution -------------------------------

# Call the counting function
df_counts = count_files(split_dir, target_classes, splits, categories, supported_extensions)

# Display the counts
display(df_counts)

# Calculate and display total counts per class and split
df_totals = df_counts.copy()
df_totals['Total'] = df_totals['positive'] + df_totals['negative']
display(df_totals)

# Calculate overall totals
overall_totals = {
    'Class': 'All Classes',
    'Split': 'All Splits',
    'positive': df_counts['positive'].sum(),
    'negative': df_counts['negative'].sum(),
    'Total': df_totals['Total'].sum()
}
df_overall = pd.DataFrame([overall_totals])
display(df_overall)


Processing classes: 100%|██████████| 3/3 [00:00<00:00, 22836.50class/s]




KeyError: 'Number of Files'

In [12]:

import random

# ----------------------------- Configuration -----------------------------

# Path to the split data directory
split_dir = project_root / 'data' / 'resized_and_split'

# Target classes for which to balance negative samples
target_classes = ['fox', 'elephant', 'tiger']

# Splits and categories
splits = ['train', 'validation', 'test']
categories = ['positive', 'negative']

# Supported image file extensions
supported_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']

# Set a random seed for reproducibility
random.seed(42)

# ----------------------------- Helper Functions -------------------------

def get_file_list(directory):
    """Retrieve all image files in a directory with supported extensions."""
    return [f for f in directory.iterdir() if f.is_file() and f.suffix.lower() in supported_extensions]

def balance_negatives(target_class, split, split_dir):
    """
    Balance the number of negative samples to match the number of positive samples.
    
    Args:
        target_class (str): The class for which to balance negatives.
        split (str): The data split ('train', 'validation', 'test').
        split_dir (Path): Path to the split data directory.
    """
    positive_dir = split_dir / target_class / split / 'positive'
    negative_dir = split_dir / target_class / split / 'negative'
    
    # Check if directories exist
    if not positive_dir.exists():
        print(f"Warning: Positive directory '{positive_dir}' does not exist. Skipping.")
        return
    if not negative_dir.exists():
        print(f"Warning: Negative directory '{negative_dir}' does not exist. Skipping.")
        return
    
    # Get counts
    positive_files = get_file_list(positive_dir)
    negative_files = get_file_list(negative_dir)
    
    num_positive = len(positive_files)
    num_negative = len(negative_files)
    
    print(f"\nClass: {target_class} | Split: {split}")
    print(f"Positive samples: {num_positive}")
    print(f"Negative samples: {num_negative}")
    
    if num_negative <= num_positive:
        print("No balancing needed. Negative samples are already less than or equal to positive samples.")
        return
    
    # Determine number of negatives to keep
    num_negatives_to_keep = num_positive
    
    # Randomly select negatives to keep
    negatives_to_keep = set(random.sample(negative_files, num_negatives_to_keep))
    
    # Determine negatives to delete
    negatives_to_delete = [f for f in negative_files if f not in negatives_to_keep]
    
    print(f"Deleting {len(negatives_to_delete)} negative samples to balance the dataset.")
    
    # Delete the selected negatives
    for file_path in tqdm(negatives_to_delete, desc=f"Deleting negatives for {target_class} - {split}", unit="file"):
        try:
            file_path.unlink()
        except Exception as e:
            print(f"Error deleting file '{file_path}': {e}")
    
    # Final counts after deletion
    final_negatives = len(get_file_list(negative_dir))
    print(f"Final negative samples: {final_negatives}")

# ------------------------------- Execution -------------------------------

# Iterate over each target class and split to balance negatives
for cls in target_classes:
    for split in splits:
        balance_negatives(cls, split, split_dir)

print("\nBalancing of negative samples completed for all specified classes and splits.")



Class: fox | Split: train
Positive samples: 4549
Negative samples: 35528
Deleting 30979 negative samples to balance the dataset.


Deleting negatives for fox - train: 100%|██████████| 30979/30979 [00:00<00:00, 89305.59file/s]


Final negative samples: 4549

Class: fox | Split: validation
Positive samples: 1300
Negative samples: 10150
Deleting 8850 negative samples to balance the dataset.


Deleting negatives for fox - validation: 100%|██████████| 8850/8850 [00:00<00:00, 134777.92file/s]


Final negative samples: 1300

Class: fox | Split: test
Positive samples: 650
Negative samples: 5076
Deleting 4426 negative samples to balance the dataset.


Deleting negatives for fox - test: 100%|██████████| 4426/4426 [00:00<00:00, 144568.10file/s]


Final negative samples: 650

Class: elephant | Split: train
Positive samples: 8426
Negative samples: 31651
Deleting 23225 negative samples to balance the dataset.


Deleting negatives for elephant - train: 100%|██████████| 23225/23225 [00:00<00:00, 96242.21file/s]


Final negative samples: 8426

Class: elephant | Split: validation
Positive samples: 2407
Negative samples: 9043
Deleting 6636 negative samples to balance the dataset.


Deleting negatives for elephant - validation: 100%|██████████| 6636/6636 [00:00<00:00, 101273.13file/s]


Final negative samples: 2407

Class: elephant | Split: test
Positive samples: 1204
Negative samples: 4522
Deleting 3318 negative samples to balance the dataset.


Deleting negatives for elephant - test: 100%|██████████| 3318/3318 [00:00<00:00, 114576.58file/s]


Final negative samples: 1204

Class: tiger | Split: train
Positive samples: 4883
Negative samples: 35194
Deleting 30311 negative samples to balance the dataset.


Deleting negatives for tiger - train: 100%|██████████| 30311/30311 [00:00<00:00, 93416.05file/s]


Final negative samples: 4883

Class: tiger | Split: validation
Positive samples: 1395
Negative samples: 10055
Deleting 8660 negative samples to balance the dataset.


Deleting negatives for tiger - validation: 100%|██████████| 8660/8660 [00:00<00:00, 109564.71file/s]


Final negative samples: 1395

Class: tiger | Split: test
Positive samples: 698
Negative samples: 5028
Deleting 4330 negative samples to balance the dataset.


Deleting negatives for tiger - test: 100%|██████████| 4330/4330 [00:00<00:00, 101350.70file/s]

Final negative samples: 698

Balancing of negative samples completed for all specified classes and splits.



