In [None]:
import os
import random
import shutil
from pathlib import Path
from collections import defaultdict

# === CONFIGURATION ===
# Folder that contains RC directories like RC05216.
# Built with the "/" operator rather than a literal backslash: "\o" is an
# invalid escape sequence (SyntaxWarning on recent Python versions) and a
# backslash is only treated as a path separator on Windows.
SOURCE_ROOT = Path("image_data") / "output_2"
OUTPUT_DIR = Path("sample_segmented_chars")
SAMPLE_SIZE = 430              # Total number of images to sample
# Minimum number of images taken from each RC directory (e.g. RC05216);
# directories holding fewer images than this are left out of the sampling.
MIN_PER_DIR = 80
OUTPUT_SUBDIR = OUTPUT_DIR / "training_set"

# === FUNCTIONS ===
def get_all_character_images(root_dir):
    """
    Collect PNG file paths from every RC directory directly under root_dir.

    A directory qualifies when its name starts with "RC"; it is searched
    recursively and files are matched on a case-insensitive ".png" ending.

    Directories and file lists are sorted because neither Path.iterdir()
    nor os.walk() guarantees an ordering; without sorting, seeded random
    sampling over the result would not be reproducible across machines.

    Args:
        root_dir: Path (or str) of the folder that contains the RC directories.

    Returns:
        defaultdict {RC_dir_name: [sorted list of image Paths]}. RC
        directories that contain no PNG files get no entry.

    Raises:
        OSError: if root_dir cannot be listed (e.g. it does not exist).
    """
    root_dir = Path(root_dir)
    char_img_dict = defaultdict(list)
    for rc_dir in sorted(root_dir.iterdir()):
        if not (rc_dir.is_dir() and rc_dir.name.startswith("RC")):
            continue
        found = []
        for root, _, files in os.walk(rc_dir):
            for file in files:
                if file.lower().endswith(".png"):
                    found.append(Path(root) / file)
        # Only create the key when something was found, so empty RC
        # directories never show up as empty groups.
        if found:
            char_img_dict[rc_dir.name] = sorted(found)
    return char_img_dict

def sample_balanced_images(char_img_dict, total_samples, min_per_group):
    """
    Randomly sample total_samples images, taking at least min_per_group
    from every eligible group.

    A group is eligible only when it holds at least min_per_group images;
    smaller groups are excluded entirely and contribute nothing to the result.

    Args:
        char_img_dict: mapping {group_name: [image paths]}.
        total_samples: total number of images to return.
        min_per_group: number of images guaranteed from each eligible group.

    Returns:
        List of total_samples distinct images: first the per-group minimum
        draws (in group iteration order), then the randomly drawn remainder.

    Raises:
        ValueError: if the eligible groups hold fewer than total_samples
            images, or if the per-group minimums alone would exceed
            total_samples.
    """
    eligible = {k: v for k, v in char_img_dict.items() if len(v) >= min_per_group}
    if sum(len(v) for v in eligible.values()) < total_samples:
        raise ValueError("Not enough eligible data to sample.")

    # Fail early with a clear message instead of letting the fill step call
    # random.sample() with a negative count.
    reserved = min_per_group * len(eligible)
    if reserved > total_samples:
        raise ValueError(
            f"Cannot take {min_per_group} images from each of {len(eligible)} "
            f"eligible groups within a total of {total_samples} samples."
        )

    # First ensure minimum per group
    selected = []
    for imgs in eligible.values():
        selected.extend(random.sample(imgs, min_per_group))

    # Fill the rest from everything not yet chosen. A set keeps the
    # membership test O(1); the pool itself stays in group order so seeded
    # runs draw the same images.
    already_chosen = set(selected)
    pool = []
    for imgs in eligible.values():
        pool.extend(img for img in imgs if img not in already_chosen)
    selected.extend(random.sample(pool, total_samples - reserved))
    return selected

def copy_samples_to_output(samples, output_subdir):
    """
    Copy the selected sample files into output_subdir (created if missing).

    Each file normally keeps its own name. Samples come from different
    source directories, so two of them may share a file name; copying both
    under that name would silently overwrite the first. When a name has
    already been used in this call, the copy is instead named
    "<parent_dir>_<name>", with a numeric suffix added if that is taken too.

    Files already present in output_subdir from an earlier run are still
    overwritten, so re-running with the same samples refreshes them.

    Args:
        samples: iterable of pathlib.Path objects pointing at the files to copy.
        output_subdir: pathlib.Path of the destination directory.
    """
    output_subdir.mkdir(parents=True, exist_ok=True)
    used_names = set()
    for img_path in samples:
        dest_name = img_path.name
        if dest_name in used_names:
            # Disambiguate with the source directory name.
            prefix = img_path.parent.name
            dest_name = f"{prefix}_{img_path.name}"
            counter = 1
            while dest_name in used_names:
                dest_name = f"{prefix}_{img_path.stem}_{counter}{img_path.suffix}"
                counter += 1
        used_names.add(dest_name)
        shutil.copy(img_path, output_subdir / dest_name)


In [None]:
# Driver cell: scan the source tree, draw the sample, copy it to the output.
random.seed(42)  # fixed seed so repeated runs draw the same sample
# Report the directory that is actually scanned rather than a hard-coded name.
print(f"Scanning RC directories under {SOURCE_ROOT}...")
char_dict = get_all_character_images(SOURCE_ROOT)

print("Sampling characters...")
# Raises ValueError if the eligible directories cannot supply SAMPLE_SIZE images.
sampled_images = sample_balanced_images(char_dict, SAMPLE_SIZE, MIN_PER_DIR)

print(f"Copying {len(sampled_images)} images to {OUTPUT_SUBDIR}...")
copy_samples_to_output(sampled_images, OUTPUT_SUBDIR)

print("Done. Sampled training set is ready.")