In [1]:
import os
import json
import shutil
import random 
from tqdm import tqdm

def bbox_to_yolo(bbox, img_w, img_h):
    x, y, w, h = bbox
    x_center = x + w / 2
    y_center = y + h / 2
    return [
        x_center / img_w,
        y_center / img_h,
        w/img_w,
        h/img_h
    ]
sub_category_name_to_index = {
    "자동차(불법주정차)": 0,
    "보행자": 1,
    "유모차": 2,
    "전동킥보드": 3,
    "자전거": 4,
    "오토바이": 5,
    "동물": 6,
    "볼라드": 7,
    "분리봉": 8,
    "표지판": 9,
    "자전거보관대": 10,
    "횡단보도": 11,
    "신호등": 12,
    "음수대": 13,
    "매점": 14,
    "화장실": 15

}

In [2]:
# 경로 설정
train_img_root = r"D:/187.자전거도로 주행 데이터/01-1.정식개방데이터/Train/01.원천데이터"
train_label_root = r"D:/187.자전거도로 주행 데이터/01-1.정식개방데이터/Train/02.라벨링데이터"
val_img_root = r"D:/187.자전거도로 주행 데이터/01-1.정식개방데이터/val/01.원천데이터"
val_label_root = r"D:/187.자전거도로 주행 데이터/01-1.정식개방데이터/val/02.라벨링데이터"

# 저장 경로
save_img = "data/images"
save_label = "data/labels"
for split in ["train", "test", "val"]:
    os.makedirs(os.path.join(save_img, split), exist_ok=True)
    os.makedirs(os.path.join(save_label, split), exist_ok=True)

# 모든 train 이미지/라벨 경로 수집
train_img_files = []
for root, _, files in os.walk(train_img_root):
    for f in files:
        if f.lower().endswith(('.jpg', '.png', '.jpeg')):
            train_img_files.append(os.path.join(root, f))
random.shuffle(train_img_files)
n_train = int(len(train_img_files) * 0.7)
train_split = train_img_files[:n_train]
test_split = train_img_files[n_train:]

# val 이미지 경로 수집
val_img_files = []
for root, _, files in os.walk(val_img_root):
    for f in files:
        if f.lower().endswith(('.jpg', '.png', '.jpeg')):
            val_img_files.append(os.path.join(root, f))

In [6]:
def convert_and_save(json_path, save_txt_path, img_w, img_h):
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    lines = []
    for ann in data["annotations"]:
        if ann["drawing"] == "bbox":
            class_name = ann["sub_category_name"]
            class_idx = sub_category_name_to_index.get(class_name, -1)
            if class_idx == -1:
                continue
            x, y, w, h = ann["bbox"]
            yolo_box = bbox_to_yolo([x, y, w, h], img_w, img_h)
            lines.append(f"{class_idx} {' '.join(f'{v:.6f}' for v in yolo_box)}")
    # bbox가 없어도 빈 파일 생성
    with open(save_txt_path, "w") as f:
        f.write("\n".join(lines))

def find_json(label_root, img_filename):
    # img_filename: "xxx.jpg" → "xxx.json" 찾기
    for root, _, files in os.walk(label_root):
        for f in files:
            if f.endswith(".json") and f.replace(".json", "") == img_filename.replace(".jpg", ""):
                return os.path.join(root, f)
    return None

In [7]:
# train
for img_path in tqdm(train_split, desc="train split"):
    fname = os.path.basename(img_path)
    dst_img_path = os.path.join(save_img, "train", fname)
    dst_label_path = os.path.join(save_label, "train", fname.replace(".jpg", ".txt"))
    # 이미지가 이미 있으면 건너뜀
    if os.path.exists(dst_img_path) and os.path.exists(dst_label_path):
        continue
    json_path = find_json(train_label_root, fname)
    if json_path:
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        img_w, img_h = int(data["images"]["width"]), int(data["images"]["height"])
        convert_and_save(json_path, dst_label_path, img_w, img_h)
        shutil.copy(img_path, dst_img_path)



train split: 100%|██████████| 150417/150417 [5:32:47<00:00,  7.53it/s]  



In [8]:
# test
for img_path in tqdm(test_split, desc='test_split'):
    fname = os.path.basename(img_path)
    json_path = find_json(train_label_root, fname)
    if json_path:
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        img_w, img_h = int(data["images"]["width"]), int(data["images"]["height"])
        convert_and_save(json_path, os.path.join(save_label, "test", fname.replace(".jpg", ".txt")), img_w, img_h)
        shutil.copy(img_path, os.path.join(save_img, "test", fname))

# val
for img_path in tqdm(val_img_files,desc='val_split'):
    fname = os.path.basename(img_path)
    json_path = find_json(val_label_root, fname)
    if json_path:
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        img_w, img_h = int(data["images"]["width"]), int(data["images"]["height"])
        convert_and_save(json_path, os.path.join(save_label, "val", fname.replace(".jpg", ".txt")), img_w, img_h)


test_split: 100%|██████████| 64465/64465 [2:18:21<00:00,  7.77it/s]  
test_split: 100%|██████████| 64465/64465 [2:18:21<00:00,  7.77it/s]
val_split: 100%|██████████| 26863/26863 [07:28<00:00, 59.86it/s]
val_split: 100%|██████████| 26863/26863 [07:28<00:00, 59.86it/s]
