In [2]:
import os, json, shutil, re, random
from pathlib import Path
from collections import defaultdict

# ---------- CONFIG (adjust these paths to match your machine) ----------
# Root folder: raw datasets are read from below it and the merged output is written below it.
ROOT = Path(r"F:\smart-crossing\training\data")

# Destination of the merged dataset; created up front so later writes cannot fail on a missing folder.
OUT_DIR = ROOT.joinpath("split")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Some exports keep pictures in "<split>/images", others directly in "<split>".
def images_dir_of(split_dir: Path) -> Path:
    """Return ``split_dir / "images"`` when that path exists, else ``split_dir`` itself."""
    nested = split_dir / "images"
    if nested.exists():
        return nested
    return split_dir

# Datasets to merge; each source is described by a COCO JSON file plus an images dir.


def _roboflow_split_paths(ds_root: Path) -> dict:
    """Build the {"train"|"val"|"test": {"json", "imgs"}} table for one dataset folder.

    The validation folder is "valid" when that folder exists, otherwise "val".
    """
    val_dir = ds_root / "valid" if (ds_root / "valid").exists() else ds_root / "val"
    split_dirs = {"train": ds_root / "train", "val": val_dir, "test": ds_root / "test"}
    return {
        split_name: {
            "json": split_dir / "_annotations.coco.json",
            "imgs": images_dir_of(split_dir),
        }
        for split_name, split_dir in split_dirs.items()
    }


# 1) COCO subset export (no split folders of its own; it is listed under "train" below)
_COCO_SUB_ROOT = ROOT / "raw" / "coco_subset_export"
COCO_SUB_JSON  = _COCO_SUB_ROOT / "labels.json"
COCO_SUB_IMGS  = _COCO_SUB_ROOT / "data"

# 2) Roboflow pedestrian lights (train/valid/test folders)
PL_ROOT        = ROOT / "raw" / "pedestrian-lights-1"
PL = _roboflow_split_paths(PL_ROOT)

# 3) Roboflow zebra crossings (train/valid/test folders)
ZB_ROOT        = ROOT / "raw" / "roboflow_zebra"
ZB = _roboflow_split_paths(ZB_ROOT)

# All sources grouped by output split. The COCO subset contributes to TRAIN only.
# NOTE: every Roboflow split appears twice in a row -- first the bare
# {"json", "imgs"} dict, then a copy that additionally carries a "name" tag.
SOURCES = {
    "train": [
        dict(name="coco_sub", json=COCO_SUB_JSON, imgs=COCO_SUB_IMGS),
        PL["train"], dict(name="pl_train", **PL["train"]),
        ZB["train"], dict(name="zb_train", **ZB["train"]),
    ],
    "val": [
        PL["val"], dict(name="pl_val", **PL["val"]),
        ZB["val"], dict(name="zb_val", **ZB["val"]),
    ],
    "test": [
        PL["test"], dict(name="pl_test", **PL["test"]),
        ZB["test"], dict(name="zb_test", **ZB["test"]),
    ],
}
# ---------------------------------------------------------------

def norm_name(name: str) -> str:
    """Return a canonical key for a category name.

    Surrounding whitespace is stripped, the text is lower-cased, and every run
    of whitespace and/or hyphens collapses to a single underscore, so that
    'traffic light', 'Traffic-Light' and 'traffic_light' all map to the same key.
    """
    return re.sub(r"[\s\-]+", "_", name.strip().lower())

def load_coco(json_path: Path):
    """Parse a COCO JSON file (read as UTF-8) and return its top-level dict.

    The keys "images", "annotations" and "categories" are guaranteed to be
    present afterwards; any that the file lacks is added as an empty list.
    """
    with json_path.open(encoding="utf-8") as fh:
        coco = json.load(fh)
    for required_key in ("images", "annotations", "categories"):
        coco.setdefault(required_key, [])
    return coco

def ensure_out_split(split: str):
    """Create ``OUT_DIR/<split>/images`` (with parents) unless it already exists."""
    images_out = OUT_DIR / split / "images"
    images_out.mkdir(exist_ok=True, parents=True)

# Unified category table shared by every source, keyed by normalized name.
unified_name2id = {}   # normalized name -> unified category id
unified_id2name = {}   # unified category id -> name as spelled where first seen
next_cat_id = 1        # next unified id to hand out

def add_categories_from(d):
    """Register every not-yet-known category of COCO dict ``d`` in the unified table.

    Categories are compared by ``norm_name``; a new one receives the next free
    id, and the spelling from its first occurrence is the one that is kept.
    """
    global next_cat_id
    for cat in d["categories"]:
        key = norm_name(cat["name"])
        if key in unified_name2id:
            continue
        assigned_id = next_cat_id
        unified_name2id[key] = assigned_id
        unified_id2name[assigned_id] = cat["name"]
        next_cat_id = assigned_id + 1

# First pass: scan all sources to collect category names.
# Every entry is visited (no fixed stride), so the scan does not depend on how
# many entries a split list has or on where duplicates sit in it. A JSON file
# referenced by several entries is loaded only once; files are still visited
# in list order, so unified category ids are assigned in the same order.
_scanned_jsons = set()
for split, items in SOURCES.items():
    for src in items:
        if not isinstance(src, dict) or "json" not in src:
            continue
        jp = Path(src["json"])
        if jp in _scanned_jsons or not jp.exists():
            continue
        _scanned_jsons.add(jp)
        add_categories_from(load_coco(jp))

# Per-split accumulators in COCO layout; images/annotations are filled in later.
merged = {
    split_name: {"images": [], "annotations": [], "categories": []}
    for split_name in ("train", "val", "test")
}
# Each split gets its own copy of the unified category list, ordered by id.
for s in merged:
    merged[s]["categories"] = [
        {"id": cat_id, "name": unified_id2name[cat_id]}
        for cat_id in sorted(unified_id2name)
    ]

# Copy + remap
# Seed kept from the original cell; nothing in this block draws random numbers.
random.seed(123)


def _unique_sources(items):
    """Return one source per distinct (json, imgs) pair, in first-seen order.

    Entries that are not dicts or lack a "json" key are ignored. When the same
    pair is listed more than once, the entry carrying a non-empty "name" is
    preferred so the configured tag is the one used for output file names.
    """
    picked = {}
    for entry in items:
        if not isinstance(entry, dict) or "json" not in entry:
            continue
        key = (str(entry["json"]), str(entry.get("imgs")))
        current = picked.get(key)
        if current is None or (entry.get("name") and not current.get("name")):
            picked[key] = entry  # replacing a value keeps the key's position
    return list(picked.values())


for split, items in SOURCES.items():
    ensure_out_split(split)
    out_img_dir = OUT_DIR / split / "images"

    # Continue numbering after anything already merged into this split.
    next_img_id = 1 + len(merged[split]["images"])
    next_ann_id = 1 + len(merged[split]["annotations"])

    for src in _unique_sources(items):
        json_path = Path(src["json"])
        img_dir   = Path(src["imgs"])
        if not json_path.exists():
            print(f"[WARN] Missing JSON: {json_path} (skipped)")
            continue
        if not img_dir.exists():
            print(f"[WARN] Missing images dir: {img_dir} (skipped)")
            continue

        data = load_coco(json_path)

        # Map source cat id -> unified cat id
        src_id2uni = {
            c["id"]: unified_name2id[norm_name(c["name"])] for c in data["categories"]
        }

        # Prefix for copied files so equal file names from different sources
        # cannot collide. Falls back to the name of the folder two levels above
        # the JSON file when the entry has no "name".
        ds_tag = src.get("name") or json_path.parent.parent.name

        # Build map src image id -> new image id, copying files as we go
        imid_map = {}
        missing_images = 0
        for im in data["images"]:
            old_fname = im["file_name"]
            # Some COCO JSONs keep relative paths; try as given, then the last part only.
            src_path = img_dir / old_fname
            if not src_path.is_file():
                src_path = img_dir / Path(old_fname).name
                if not src_path.is_file():
                    missing_images += 1
                    continue

            # Output images live in one flat folder, so fold any path separators
            # in file_name into underscores. Plain file names are unchanged.
            flat_name = "_".join(
                part for part in re.split(r"[\\/]+", old_fname)
                if part not in ("", ".", "..")
            )
            new_fname = f"{ds_tag}__{flat_name}"
            dst_path  = out_img_dir / new_fname
            if not dst_path.exists():
                shutil.copy2(src_path, dst_path)

            merged[split]["images"].append({
                "id": next_img_id,
                "file_name": new_fname,
                "width": im.get("width"),
                "height": im.get("height"),
            })
            imid_map[im["id"]] = next_img_id
            next_img_id += 1

        # Remap annotations to new ids
        unknown_cat_anns = 0
        for ann in data["annotations"]:
            sid = ann.get("image_id")
            if sid not in imid_map:
                continue  # image was missing
            uni_cat = src_id2uni.get(ann.get("category_id"))
            if uni_cat is None:
                # category_id not declared in this file's "categories" list
                unknown_cat_anns += 1
                continue
            merged[split]["annotations"].append({
                "id": next_ann_id,
                "image_id": imid_map[sid],
                "category_id": uni_cat,
                "bbox": ann.get("bbox", []),
                # "or 0" also covers an explicit null in the JSON
                "area": float(ann.get("area") or 0),
                "iscrowd": int(ann.get("iscrowd") or 0),
                "segmentation": ann.get("segmentation", []),
            })
            next_ann_id += 1

        if missing_images:
            print(f"[WARN] {missing_images} image(s) listed in {json_path} not found under {img_dir} (skipped)")
        if unknown_cat_anns:
            print(f"[WARN] {unknown_cat_anns} annotation(s) in {json_path} use an undeclared category_id (skipped)")
        print(f"[{split}] merged: {json_path.name}  -> images={len(imid_map)}")

# Write COCO JSONs: one "<split>.json" per split, directly under OUT_DIR.
for split in ("train", "val", "test"):
    out_json = OUT_DIR / f"{split}.json"
    split_data = merged[split]
    with out_json.open("w", encoding="utf-8") as f:
        f.write(json.dumps(split_data))
    print(f"✓ Wrote {out_json} | images={len(split_data['images'])} annots={len(split_data['annotations'])} cats={len(split_data['categories'])}")

print("Done. Merged dataset at:", OUT_DIR)


[train] merged: labels.json  -> images=0
[train] merged: _annotations.coco.json  -> images=4155
[train] merged: _annotations.coco.json  -> images=2034
[val] merged: _annotations.coco.json  -> images=307
[val] merged: _annotations.coco.json  -> images=580
[test] merged: _annotations.coco.json  -> images=303
[test] merged: _annotations.coco.json  -> images=281
✓ Wrote F:\smart-crossing\training\data\split\train.json | images=6189 annots=0 cats=0
✓ Wrote F:\smart-crossing\training\data\split\val.json | images=887 annots=0 cats=0
✓ Wrote F:\smart-crossing\training\data\split\test.json | images=584 annots=0 cats=0
Done. Merged dataset at: F:\smart-crossing\training\data\split
