#### **PARTITION UNAUGMENTED DATA**

In [None]:
import os
import json
import random
import shutil
from pathlib import Path

# =========================
# CONFIG
# =========================
# Dataset root; the raw images and their JSON labels sit in sibling folders.
base_dir = Path("/content/data")
images_dir = base_dir.joinpath("images")
labels_dir = base_dir.joinpath("labels")

# Fraction of the samples assigned to each split (train, test, val order).
SPLIT_RATIOS = dict(train=0.70, test=0.15, val=0.15)
# Seed for the shuffle that precedes splitting.
SEED = 42

# True -> samples whose label json has no shapes (no face) are dropped.
FILTER_EMPTY_LABELS = True

# True -> files already present in the split folders are replaced.
OVERWRITE = False

# Either "copy" or "symlink" (symlink works on Linux/Colab).
COPY_METHOD = "copy"

# Lower-case file suffixes that count as images.
IMG_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
}

# =========================
# HELPERS
# =========================
def ensure_dirs():
    """Create the ``images`` and ``labels`` folders of every configured split.

    For each key of ``SPLIT_RATIOS`` the directories
    ``<base_dir>/<split>/images`` and ``<base_dir>/<split>/labels`` are
    created. Missing parents are created too, and directories that already
    exist are left untouched.
    """
    for split in SPLIT_RATIOS:
        split_root = base_dir / split
        for kind in ("images", "labels"):
            (split_root / kind).mkdir(parents=True, exist_ok=True)

def label_has_shapes(label_path: Path) -> bool:
    """Return True when the label JSON holds a non-empty ``"shapes"`` list.

    Args:
        label_path: Path of the JSON label file to inspect.

    Returns:
        True if the file parses as JSON and its ``"shapes"`` entry is a list
        with at least one element. Any failure while reading or parsing
        (missing file, corrupt JSON, unexpected top-level type) is treated
        as an invalid label and yields False.
    """
    try:
        payload = json.loads(label_path.read_text(encoding="utf-8"))
        shapes = payload.get("shapes", [])
    except Exception:
        # Unreadable or malformed labels count as "no shapes".
        return False
    return isinstance(shapes, list) and bool(shapes)

def safe_link_or_copy(src: Path, dst: Path, *, overwrite=None, method=None):
    """Place ``src`` at ``dst`` by copying it or by creating a symlink.

    Args:
        src: Existing file to copy or to link to.
        dst: Destination path; its parent directory must already exist.
        overwrite: When true, an existing ``dst`` is removed and recreated;
            when false, an existing ``dst`` is left alone. Defaults to the
            module-level ``OVERWRITE`` setting.
        method: ``"symlink"`` creates a symbolic link; any other value copies
            the file with ``shutil.copy2``. Defaults to the module-level
            ``COPY_METHOD`` setting.

    A dangling symlink at ``dst`` is always replaced, whatever ``overwrite``
    says: it is not a usable file, ``os.symlink`` would fail on it, and
    ``shutil.copy2`` would write through it to the stale target.
    """
    if overwrite is None:
        overwrite = OVERWRITE
    if method is None:
        method = COPY_METHOD

    if dst.is_symlink() and not dst.exists():
        # Path.exists() follows links, so a broken link reports False;
        # remove it explicitly before recreating the destination.
        dst.unlink()
    elif dst.exists():
        if not overwrite:
            return
        dst.unlink()

    if method == "symlink":
        # A relative link target is resolved relative to the link's own
        # directory, so always store an absolute target.
        os.symlink(os.path.abspath(src), str(dst))
    else:
        shutil.copy2(src, dst)

def collect_valid_samples():
    """Pair every usable image in ``images_dir`` with its label file.

    An image is usable when it is a regular file whose lower-cased suffix is
    in ``IMG_EXTS``, a ``<stem>.json`` file exists in ``labels_dir`` and, if
    ``FILTER_EMPTY_LABELS`` is set, ``label_has_shapes`` accepts that label.

    Returns:
        A tuple ``(samples, missing_labels, empty_labels)`` where ``samples``
        is a list of ``(image_path, label_path)`` pairs sorted by image path,
        ``missing_labels`` counts images without a label file, and
        ``empty_labels`` counts images dropped by the empty-label filter.

    Raises:
        OSError: If ``images_dir`` cannot be listed (e.g. it does not exist).
    """
    samples = []
    missing_labels = 0
    empty_labels = 0

    # iterdir() yields entries in arbitrary, filesystem-dependent order.
    # Sorting gives the seeded shuffle downstream the same input on every
    # machine, so the resulting split is actually reproducible.
    for img_path in sorted(images_dir.iterdir()):
        if not img_path.is_file():
            continue
        if img_path.suffix.lower() not in IMG_EXTS:
            continue

        label_path = labels_dir / (img_path.stem + ".json")
        if not label_path.exists():
            missing_labels += 1
            continue

        if FILTER_EMPTY_LABELS and (not label_has_shapes(label_path)):
            empty_labels += 1
            continue

        samples.append((img_path, label_path))

    return samples, missing_labels, empty_labels

def split_samples(samples, *, ratios=None, seed=None):
    """Shuffle ``samples`` deterministically and cut them into three splits.

    Args:
        samples: Sequence of items to split. It is not modified.
        ratios: Mapping with ``"train"`` and ``"test"`` fractions; whatever
            remains after those two goes to ``"val"``. Defaults to the
            module-level ``SPLIT_RATIOS``.
        seed: Seed for the shuffle. Defaults to the module-level ``SEED``.

    Returns:
        A dict with the keys ``"train"``, ``"test"`` and ``"val"`` whose lists
        together contain every input item exactly once.

    The result depends on the order of ``samples`` as well as on the seed.
    """
    ratios = SPLIT_RATIOS if ratios is None else ratios
    seed = SEED if seed is None else seed

    # Shuffle a copy with a private generator: the caller's list and the
    # global ``random`` state stay untouched, while the permutation is the
    # same one ``random.seed(seed); random.shuffle(...)`` would produce.
    shuffled = list(samples)
    random.Random(seed).shuffle(shuffled)

    n_total = len(shuffled)
    n_train = int(ratios["train"] * n_total)
    n_test = int(ratios["test"] * n_total)
    test_end = n_train + n_test

    # The tail slice takes the remainder, so the three parts always sum to
    # n_total even when the truncated counts do not.
    return {
        "train": shuffled[:n_train],
        "test": shuffled[n_train:test_end],
        "val": shuffled[test_end:],
    }

def materialize_split(split_name, items):
    """Place every sample of one split into that split's output folders.

    Args:
        split_name: Name of the split; selects ``<base_dir>/<split_name>``.
        items: Iterable of ``(image_path, label_path)`` pairs.

    Each image goes to ``<split>/images`` and each label to
    ``<split>/labels`` under its original file name, using
    ``safe_link_or_copy``.
    """
    split_root = base_dir / split_name
    for image_src, label_src in items:
        safe_link_or_copy(image_src, split_root / "images" / image_src.name)
        safe_link_or_copy(label_src, split_root / "labels" / label_src.name)

def write_manifest(splits_dict):
    """Write one ``<split>.txt`` listing per split for reproducibility/debugging.

    Args:
        splits_dict: Mapping of split name to a list of
            ``(image_path, label_path)`` pairs.

    The files are written to ``<base_dir>/splits_manifest`` and contain one
    image file name per line.
    """
    manifest_dir = base_dir / "splits_manifest"
    manifest_dir.mkdir(exist_ok=True)

    for split_name, items in splits_dict.items():
        listing_path = manifest_dir / f"{split_name}.txt"
        with listing_path.open("w", encoding="utf-8") as handle:
            handle.writelines(img_path.name + "\n" for img_path, _ in items)

# =========================
# RUN
# =========================
# Make sure the output folders of every split exist.
ensure_dirs()

# Report the input locations before scanning them: collect_valid_samples()
# raises when the images directory is missing, and these lines are what
# explain that failure.
print(f"Found images dir: {images_dir} (exists={images_dir.exists()})")
print(f"Found labels dir: {labels_dir} (exists={labels_dir.exists()})")

# Pair images with labels and report what was kept and what was skipped.
samples, missing_labels, empty_labels = collect_valid_samples()
print(f"Total valid samples (image+label matched"
      f"{' + non-empty shapes' if FILTER_EMPTY_LABELS else ''}): {len(samples)}")
print(f"Skipped (missing label): {missing_labels}")
print(f"Skipped (empty/corrupt label): {empty_labels}")

# Shuffle + split, then show the size of each split.
splits_dict = split_samples(samples)
for k, v in splits_dict.items():
    print(f"{k}: {len(v)}")

# Copy/link the files of every split into its folders.
for split_name, items in splits_dict.items():
    materialize_split(split_name, items)

# Record which image ended up in which split.
write_manifest(splits_dict)
print("Done. Splits created under:", base_dir)

In [None]:
# Move each label JSON from data/labels into the split folder that holds its
# image. Paths are relative to the current working directory.
# NOTE: os.replace moves the file, so data/labels no longer contains it
# afterwards.
for folder in ['train', 'test', 'val']:
    for file in os.listdir(os.path.join('data', folder, 'images')):
        # splitext strips only the final extension, so an image such as
        # "img.01.jpg" maps to "img.01.json" rather than "img.json".
        filename = os.path.splitext(file)[0] + '.json'
        existing_filepath = os.path.join('data', 'labels', filename)
        if os.path.exists(existing_filepath):
            new_filepath = os.path.join('data', folder, 'labels', filename)
            os.replace(existing_filepath, new_filepath)