This program splits and balances the data into the format expected by PyTorch ImageFolder

This cell copies all labeled images into a single temporary folder and renames them (prefixing the track name and merging the "no car" and "distant" labels).

In [None]:
import os
import random
import shutil

# Flatten every labeled frame into one temporary folder.
#
# Expected input layout: <source_dir>/<track folder>/labeled_frames/*.jpg
# Each frame is copied (the originals are left in place) to
# <output_dir>/<track name>_<frame file name>, where the track name is the
# folder name with all digits removed.
source_dir = 'C:/Users/fires/Downloads/By Track'  # path to folder of all labeled frames, should contain a folder for each race

output_dir = 'C:/Users/fires/Downloads/AllFrames' # temporary folder
os.makedirs(output_dir, exist_ok=True)

# Traverse the ByTrack folder
for track_folder in os.listdir(source_dir):
    track_path = os.path.join(source_dir, track_folder)
    label_folder = os.path.join(track_path, 'labeled_frames')

    # Skip stray files and folders that lack the expected labeled_frames sub-folder
    if not os.path.isdir(label_folder):
        continue

    # Track prefix: folder name with every digit stripped (e.g. drops a year).
    # Computed once per track instead of once per file.
    # NOTE(review): folders that differ only by digits end up with the same
    # prefix, so frames with identical file names would overwrite each other
    # in output_dir - confirm this cannot happen with the real data.
    track_name = ''.join(c for c in track_folder if not c.isdigit())

    for file in os.listdir(label_folder):
        # Case-insensitive check so ".JPG" frames are not silently skipped
        # (the later cells also compare extensions in lower case).
        if not file.lower().endswith('.jpg'):
            continue

        # Recategorize "no car" and "distant" into the same category.
        # The replacement is applied to the frame's own file name only, so a
        # track name that happens to contain one of the keywords is not
        # rewritten in place of the label.
        if "no car" in file:
            relabeled = file.replace("no car", "distantORnocar", 1)
        elif "distant" in file:
            relabeled = file.replace("distant", "distantORnocar", 1)
        else:
            relabeled = file

        new_filename = f"{track_name}_{relabeled}"
        src_path = os.path.join(label_folder, file)
        dst_path = os.path.join(output_dir, new_filename)

        # Copy to temporary folder (copy2 also preserves file metadata)
        shutil.copy2(src_path, dst_path)
        print(f"Copied: {src_path} → {dst_path}")

This cell separates the images into train/val/test folders (70%/15%/15%, split per category), with one sub-folder per category as expected by ImageFolder.

In [None]:
# Split all data into train/val/test (70/15/15) per category.
# Assumes every image sits directly in "source_dir" (the temporary folder
# from the previous block); the split folders are created inside it.
source_dir = output_dir

# Target base directories for the split
train_base = os.path.join(source_dir, 'train')
val_base = os.path.join(source_dir, 'val')
test_base = os.path.join(source_dir, 'test')

# Categories to look for in filenames.
# Order matters: matching is a plain substring test and "side" is contained in
# "inside", so "inside" must be processed first (its files are moved out of
# source_dir before "side" is matched).
# NOTE(review): the test runs on the whole file name, so a keyword appearing
# anywhere else in the name (e.g. in a track prefix) also matches - confirm
# the file names cannot trigger this.
categories = ["front", "inside", "side", "rear" , "distantORnocar"]

# Set random seed for reproducibility
random.seed(14052025)

# Ensure train, val and test subfolders exist for every category
for split_base in (train_base, val_base, test_base):
    for category in categories:
        os.makedirs(os.path.join(split_base, category), exist_ok=True)

image_extensions = ('.jpg', '.jpeg', '.png')

# Process each category
for category in categories:
    # Files whose name contains the category keyword.
    # sorted(): os.listdir returns entries in arbitrary order, so the seeded
    # shuffle below is only reproducible when it starts from a fixed order.
    matching_files = sorted(
        f for f in os.listdir(source_dir)
        if category in f and f.lower().endswith(image_extensions)
    )

    # Shuffle and split: 70% train, 15% val, remainder (about 15%) test
    random.shuffle(matching_files)
    n_train = int(len(matching_files) * 0.7)
    n_val = int(len(matching_files) * 0.15)
    train_files = matching_files[:n_train]
    val_files = matching_files[n_train:n_train+n_val]
    test_files = matching_files[n_train+n_val:]

    # Move files to <split>/<category>/
    for split_base, split_files in ((train_base, train_files),
                                    (val_base, val_files),
                                    (test_base, test_files)):
        for f in split_files:
            shutil.move(os.path.join(source_dir, f), os.path.join(split_base, category, f))

    print(f"Category '{category}': {len(train_files)} train, {len(val_files)} validation, {len(test_files)} test")

print("Done.")


Category 'front': 814 train, 174 validation, 175 test
Category 'inside': 604 train, 129 validation, 130 test
Category 'side': 396 train, 85 validation, 86 test
Category 'rear': 287 train, 61 validation, 62 test
Category 'distantORnocar': 976 train, 209 validation, 210 test
Done.


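This cell balances the training data by category: every training category is randomly downsampled to the size of the smallest one, while the val and test sets are copied unchanged.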

In [3]:
import os
import random
import shutil

# Build a class-balanced copy of the split dataset.
# For the 'train' split every category is randomly downsampled to the size of
# the smallest category; 'val' and 'test' are copied in full.
# Relies on `output_dir` (the folder containing train/val/test) having been
# defined by an earlier cell in the same session.
balanced_dir = 'C:/Users/fires/Downloads/BalancedData' # path to output folder for balanced data

splits = ['train', 'val', 'test']
categories = ["front", "inside", "side", "rear" , "distantORnocar"]

# Set random seed for reproducibility
random.seed(21052025)

# Same extensions the split step accepts, so no image is dropped here
image_extensions = ('.jpg', '.jpeg', '.png')

for split in splits:
    split_dir = os.path.join(output_dir, split)

    # Ensure Balanced Data directories exist for this split
    for category in categories:
        os.makedirs(os.path.join(balanced_dir, split, category), exist_ok=True)

    # List each category folder once.
    # sorted(): os.listdir order is arbitrary, so the seeded random.sample
    # below is only reproducible when it draws from a fixed order.
    files_by_category = {}
    for category in categories:
        cat_path = os.path.join(split_dir, category)
        files_by_category[category] = sorted(
            f for f in os.listdir(cat_path) if f.lower().endswith(image_extensions)
        )

    # Number of images in each category, and the minimum across all categories
    category_counts = {category: len(names) for category, names in files_by_category.items()}
    min_count = min(category_counts.values())

    # Randomly select min_count images per category for the training set;
    # copy every image for the other splits
    for category in categories:
        src_path = os.path.join(split_dir, category)
        dst_path = os.path.join(balanced_dir, split, category)
        files = files_by_category[category]
        if split == 'train':
            selected_files = random.sample(files, min_count)
        else:
            selected_files = files

        for f in selected_files:
            shutil.copy2(os.path.join(src_path, f), os.path.join(dst_path, f))

print("Balanced dataset created.")

Balanced dataset created.
