In [2]:
# 노트북 공통 설정
import os, sys, pathlib, random, math, json
from typing import List, Dict, Tuple

# Dataset root. Set the KITTI_ROOT environment variable to point the notebook
# at another copy of the dataset; the original machine-specific path stays as
# the fallback so existing runs behave exactly as before.
KITTI_ROOT = os.environ.get(
    "KITTI_ROOT",
    "/home/jinjinjara1022/AutonomousDriving/datasets/kitti_object",
)
TRAIN_DIR  = f"{KITTI_ROOT}/training"
TEST_DIR   = f"{KITTI_ROOT}/testing"   # optional: later cells skip it when missing

print("KITTI_ROOT :", KITTI_ROOT)
print("TRAIN_DIR  :", TRAIN_DIR)
print("TEST_DIR   :", TEST_DIR, "(exists:", os.path.isdir(TEST_DIR), ")")

KITTI_ROOT : /home/jinjinjara1022/AutonomousDriving/datasets/kitti_object
TRAIN_DIR  : /home/jinjinjara1022/AutonomousDriving/datasets/kitti_object/training
TEST_DIR   : /home/jinjinjara1022/AutonomousDriving/datasets/kitti_object/testing (exists: True )


In [3]:
import os, glob

def safe_count(p):
    """Return how many entries match ``<p>/*``.

    Uses ``glob``, so dot-files are not counted and a missing directory
    simply yields 0 instead of raising.
    """
    pattern = os.path.join(p, "*")
    matches = glob.glob(pattern)
    return len(matches)

# Count the entries of every KITTI sub-folder this notebook relies on.
counts = {}
counts["training/image_2"] = safe_count(f"{TRAIN_DIR}/image_2")
counts["training/label_2"] = safe_count(f"{TRAIN_DIR}/label_2")
counts["training/calib"] = safe_count(f"{TRAIN_DIR}/calib")

# The testing split is optional and has no labels.
if os.path.isdir(TEST_DIR):
    counts["testing/image_2"] = safe_count(f"{TEST_DIR}/image_2")
    counts["testing/calib"] = safe_count(f"{TEST_DIR}/calib")

print(json.dumps(counts, indent=2, ensure_ascii=False))
print("※ 기대값: train 7481개, test 7518개 (image_2/calib 기준)")

{
  "training/image_2": 7481,
  "training/label_2": 7481,
  "training/calib": 7481,
  "testing/image_2": 7518,
  "testing/calib": 7518
}
※ 기대값: train 7481개, test 7518개 (image_2/calib 기준)


In [4]:
from pathlib import Path
import random

# Write ImageSets/{train,val}.txt (seeded 80/20 split of training/image_2),
# plus ImageSets/test.txt when the testing folder exists.
imagesets_dir = Path(KITTI_ROOT) / "ImageSets"
imagesets_dir.mkdir(parents=True, exist_ok=True)

train_imgs = [p.stem for p in (Path(TRAIN_DIR) / "image_2").glob("*.png")]
train_imgs.sort()   # fixed starting order so the seeded shuffle is deterministic
assert len(train_imgs) > 0, "training/image_2 비어있음"

random.seed(42)
random.shuffle(train_imgs)
split = int(len(train_imgs)*0.8)

train_ids = sorted(train_imgs[:split])
val_ids   = sorted(train_imgs[split:])

(imagesets_dir / "train.txt").write_text("\n".join(train_ids) + "\n")
(imagesets_dir / "val.txt").write_text("\n".join(val_ids) + "\n")

if os.path.isdir(TEST_DIR):
    test_ids = sorted(p.stem for p in (Path(TEST_DIR) / "image_2").glob("*.png"))
    (imagesets_dir / "test.txt").write_text("\n".join(test_ids) + "\n")

print(f"[✓] ImageSets 생성 완료: train={len(train_ids)}, val={len(val_ids)}",
      f"(test={len(test_ids) if os.path.isdir(TEST_DIR) else 'N/A'})")
print("경로:", imagesets_dir)


[✓] ImageSets 생성 완료: train=5984, val=1497 (test=7518)
경로: /home/jinjinjara1022/AutonomousDriving/datasets/kitti_object/ImageSets


In [5]:
from dataclasses import dataclass

@dataclass
class KittiObject:
    """One object entry parsed from a single line of a KITTI label file.

    Instances are built by ``parse_label_file`` from the first 15
    whitespace-separated fields of a label line, in the order listed below.
    """
    cls: str               # field 0: class name
    trunc: float           # field 1
    occ: int               # field 2
    alpha: float           # field 3
    bbox: List[float]      # fields 4-7: 2D box as [l,t,r,b]
    dims: List[float]      # fields 8-10: [h,w,l]
    loc:  List[float]      # fields 11-13: [x,y,z] (camera coord)
    ry:   float            # field 14: rotation_y (rad)

def parse_label_file(path:str)->"List[KittiObject]":
    """Parse a KITTI label text file into a list of ``KittiObject``.

    Blank lines and ``DontCare`` entries are skipped. Every other line must
    carry at least 15 whitespace-separated fields, in this order::

        cls trunc occ alpha  l t r b  h w l  x y z  ry

    Extra trailing fields are ignored.

    Args:
        path: Path of the label file.

    Returns:
        The parsed objects, in file order.

    Raises:
        ValueError: If a line has fewer than 15 fields or a numeric field
            cannot be parsed. The message carries ``path:line_number`` so the
            offending label is easy to find.
    """
    objs = []
    with open(path, 'r') as f:
        for lineno, raw in enumerate(f, start=1):
            parts = raw.split()
            if not parts:
                continue
            cls = parts[0]
            if cls == "DontCare":  # excluded so it cannot confuse visualisation/training
                continue
            if len(parts) < 15:
                raise ValueError(
                    f"{path}:{lineno}: expected at least 15 fields, got {len(parts)}"
                )
            try:
                trunc = float(parts[1])
                occ = int(parts[2])
                alpha = float(parts[3])
                bbox = [float(tok) for tok in parts[4:8]]     # [l,t,r,b]
                dims = [float(tok) for tok in parts[8:11]]    # [h,w,l]
                loc = [float(tok) for tok in parts[11:14]]    # [x,y,z]
                ry = float(parts[14])
            except ValueError as exc:
                raise ValueError(
                    f"{path}:{lineno}: non-numeric field in label line {raw.strip()!r}"
                ) from exc
            objs.append(KittiObject(cls, trunc, occ, alpha, bbox, dims, loc, ry))
    return objs

# Sanity check: parse the first label file (sorted by name) and print its first object, if any
sample_label = sorted((Path(TRAIN_DIR)/"label_2").glob("*.txt"))[0]
parsed = parse_label_file(str(sample_label))
print("샘플 라벨:", sample_label.name, "objects:", len(parsed))
if parsed:
    print(parsed[0])


샘플 라벨: 000000.txt objects: 1
KittiObject(cls='Pedestrian', trunc=0.0, occ=0, alpha=-0.2, bbox=[712.4, 143.0, 810.73, 307.92], dims=[1.89, 0.48, 1.2], loc=[1.84, 1.47, 8.41], ry=0.01)


In [6]:
import cv2
from pathlib import Path
import random

# Save 2D-box previews for a seeded random sample of training frames.
out_dir = Path(KITTI_ROOT)/"preview_2d"
out_dir.mkdir(exist_ok=True)

# glob() yields files in filesystem order, which differs between machines;
# sort first so random.seed(0) selects the same frames everywhere.
all_ids = sorted(p.stem for p in (Path(TRAIN_DIR)/"image_2").glob("*.png"))
random.seed(0)
sample_ids = random.sample(all_ids, min(10, len(all_ids)))

for sid in sample_ids:
    img_path = Path(TRAIN_DIR)/"image_2"/f"{sid}.png"
    lbl_path = Path(TRAIN_DIR)/"label_2"/f"{sid}.txt"
    if not lbl_path.exists():
        continue
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        print("[!] could not read image, skipped:", img_path)
        continue
    for obj in parse_label_file(str(lbl_path)):
        l,t,r,b = map(int, obj.bbox)
        cv2.rectangle(img,(l,t),(r,b),(0,255,0),2)
        # Keep the class label inside the image when the box touches the top edge.
        cv2.putText(img, obj.cls, (l,max(15,t-5)), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0,255,0), 1, cv2.LINE_AA)
    cv2.imwrite(str(out_dir/f"{sid}.jpg"), img)

print("[✓] 2D 미리보기 저장:", out_dir)


[✓] 2D 미리보기 저장: /home/jinjinjara1022/AutonomousDriving/datasets/kitti_object/preview_2d


In [7]:
import numpy as np

def read_calib_file(path:str)->Dict[str, np.ndarray]:
    """Read a ``key: v1 v2 ...`` calibration text file into a dict.

    Lines without a colon are ignored. Values are stored as float32 arrays:

    * keys starting with ``P`` that hold 12 numbers -> shape (3, 4)
    * ``R0_rect`` / ``R_rect`` with 9 numbers        -> shape (3, 3)
    * ``Tr_velo_to_cam`` with 12 numbers             -> shape (3, 4)
    * any other numeric entry                        -> flat 1-D array

    Entries whose value is not purely numeric are kept as the raw stripped
    string. Previously the number parsing ran outside the ``try`` block, so
    such a line raised ValueError and the string fallback was never reached.

    Args:
        path: Path of the calibration file.

    Returns:
        Mapping from key to array (or to ``str`` for non-numeric entries).
    """
    data = {}
    with open(path, 'r') as f:
        for line in f:
            if ":" not in line:
                continue
            k, v = line.strip().split(":", 1)
            k = k.strip()
            v = v.strip()
            try:
                nums = [float(tok) for tok in v.split()]
            except ValueError:
                # Non-numeric value: keep the text instead of crashing.
                data[k] = v
                continue
            if k.startswith("P") and len(nums) == 12:
                data[k] = np.array(nums, dtype=np.float32).reshape(3, 4)
            elif k in ("R0_rect", "R_rect") and len(nums) == 9:
                data[k] = np.array(nums, dtype=np.float32).reshape(3, 3)
            elif k == "Tr_velo_to_cam" and len(nums) == 12:
                data[k] = np.array(nums, dtype=np.float32).reshape(3, 4)
            else:
                # Other entries stay flat; add reshapes here when needed.
                data[k] = np.array(nums, dtype=np.float32)
    return data

# Sanity check: load the first calib file (sorted by name) and show its keys and the P2 entry
sample_calib = sorted((Path(TRAIN_DIR)/"calib").glob("*.txt"))[0]
C = read_calib_file(str(sample_calib))
print("keys:", C.keys())
print("P2:\n", C.get("P2"))


keys: dict_keys(['P0', 'P1', 'P2', 'P3', 'R0_rect', 'Tr_velo_to_cam', 'Tr_imu_to_velo'])
P2:
 [[ 7.070493e+02  0.000000e+00  6.040814e+02  4.575831e+01]
 [ 0.000000e+00  7.070493e+02  1.805066e+02 -3.454157e-01]
 [ 0.000000e+00  0.000000e+00  1.000000e+00  4.981016e-03]]


In [10]:
import numpy as np, cv2
from pathlib import Path
import random, math

def compute_box_3d(dim, loc, ry):
    """
    Build the 8 corners of a 3D box in camera coordinates.

    dim = [h, w, l]
    loc = [x, y, z] (camera coord); the corners span y in [-h, 0] relative to it
    ry: rotation around the Y-axis in camera coordinates
    Returns a (3,8) array of corner coordinates (rows are x, y, z). Corners
    0..3 sit at y = 0 and corners 4..7 at y = -h in the object frame.
    """
    h, w, l = dim
    half_l, half_w = l/2, w/2

    # Corners in the object frame, before rotation/translation.
    box = np.vstack([
        [ half_l,  half_l, -half_l, -half_l,  half_l,  half_l, -half_l, -half_l],
        [      0,       0,       0,       0,      -h,      -h,      -h,      -h],
        [ half_w, -half_w, -half_w,  half_w,  half_w, -half_w, -half_w,  half_w],
    ])  # (3,8)

    # Rotation about the Y axis.
    cos_ry, sin_ry = math.cos(ry), math.sin(ry)
    rot_y = np.array([[ cos_ry, 0, sin_ry],
                      [ 0,      1, 0     ],
                      [-sin_ry, 0, cos_ry]], dtype=np.float32)

    # Translate to the object's location.
    origin = np.array(loc, dtype=np.float32).reshape(3,1)
    return rot_y @ box + origin

def project_to_image(pts_3d, P):
    """
    Project 3D points with a projection matrix.

    pts_3d: (3, N)
    P: (3,4) projection matrix
    return: (2,N) image coords; the homogeneous divisor is clamped to at
    least 1e-6 before dividing
    """
    num_pts = pts_3d.shape[1]
    ones_row = np.ones((1, num_pts), dtype=np.float32)
    pts_h = np.concatenate([pts_3d, ones_row], axis=0)   # (4,N)
    uvw = P @ pts_h                                       # (3,N)
    divisor = np.clip(uvw[2:], 1e-6, None)
    return uvw[:2] / divisor

def draw_projected_box3d(img, qs, color=(0,0,255), thickness=2):
    """
    Draw the 12 edges of a projected box onto img and return img.

    qs: (2,8) projected corner points. Corners 0..3 form one face, corners
    4..7 the opposite face, and corner k is joined to corner k+4.
    """
    pts = qs.T.astype(int)  # (8,2)
    face_a = [(k, (k + 1) % 4) for k in range(4)]          # (0,1),(1,2),(2,3),(3,0)
    face_b = [(k + 4, (k + 1) % 4 + 4) for k in range(4)]  # (4,5),(5,6),(6,7),(7,4)
    pillars = [(k, k + 4) for k in range(4)]               # (0,4),(1,5),(2,6),(3,7)
    for a, b in face_a + face_b + pillars:
        cv2.line(img, tuple(pts[a]), tuple(pts[b]), color, thickness)
    return img

# Draw a few sample frames (restricted to Car/Pedestrian/Cyclist)
out_dir_3d = Path(KITTI_ROOT)/"preview_3d"
out_dir_3d.mkdir(exist_ok=True)

# glob() yields files in filesystem order, which differs between machines;
# sort first so random.seed(1) selects the same frames everywhere.
ids = sorted(p.stem for p in (Path(TRAIN_DIR)/"image_2").glob("*.png"))
random.seed(1)
for sid in random.sample(ids, min(10, len(ids))):
    img_path = Path(TRAIN_DIR)/"image_2"/f"{sid}.png"
    lbl_path = Path(TRAIN_DIR)/"label_2"/f"{sid}.txt"
    calib_path = Path(TRAIN_DIR)/"calib"/f"{sid}.txt"
    if not (lbl_path.exists() and calib_path.exists()):
        continue

    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        print("[!] could not read image, skipped:", img_path)
        continue
    cal = read_calib_file(str(calib_path))
    P2  = cal["P2"]

    objects = [o for o in parse_label_file(str(lbl_path))
               if o.cls in ("Car","Pedestrian","Cyclist")]

    for o in objects:
        # 2D box (green) + 3D box (red)
        l,t,r,b = map(int, o.bbox)
        cv2.rectangle(img,(l,t),(r,b),(0,255,0),2)

        # 3D box
        corners_3d = compute_box_3d(o.dims, o.loc, o.ry)
        # Skip the 3D box (and the label below) when any corner has z <= 0, i.e. lies behind the camera
        if (corners_3d[2] <= 0).any():
            continue
        pts_2d = project_to_image(corners_3d, P2)  # (2,8)
        img = draw_projected_box3d(img, pts_2d, (0,0,255), 2)

        # Class label
        cv2.putText(img, o.cls, (l,max(15,t-5)), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0,255,0), 1, cv2.LINE_AA)

    cv2.imwrite(str(out_dir_3d/f"{sid}.jpg"), img)

print("[✓] 3D 박스 미리보기 저장:", out_dir_3d)


[✓] 3D 박스 미리보기 저장: /home/jinjinjara1022/AutonomousDriving/datasets/kitti_object/preview_3d


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
import cv2
import numpy as np
from pathlib import Path

class KittiMono3DDataset(Dataset):
    """Frame-level KITTI dataset: one item per image id listed in ``ImageSets/<split>.txt``.

    Images, labels and calibration are always read from the ``training``
    folder of ``kitti_root``; ``split`` only selects which id list is used.
    """

    def __init__(self, kitti_root:str, split:str="train"):
        """
        Args:
            kitti_root: Dataset root containing ``ImageSets`` and ``training``.
            split: Name of the id list to load (``ImageSets/<split>.txt``).

        Raises:
            AssertionError: If the id list file does not exist.
        """
        self.kitti_root = Path(kitti_root)
        self.split = split
        ids_path = self.kitti_root/"ImageSets"/f"{split}.txt"
        assert ids_path.exists(), f"{ids_path} 없음"
        self.ids = ids_path.read_text().strip().splitlines()
        self.img_dir = self.kitti_root/"training"/"image_2"
        self.lbl_dir = self.kitti_root/"training"/"label_2"
        self.calib_dir = self.kitti_root/"training"/"calib"

    def __len__(self):
        return len(self.ids)

    @staticmethod
    def _as_rows(rows, width):
        """Stack per-object lists into an (N, width) float32 array, (0, width) when there are no objects."""
        return np.asarray(rows, dtype=np.float32).reshape(-1, width)

    def __getitem__(self, idx):
        """Return ``(img_t, target, meta)`` for the idx-th frame.

        * ``img_t``: float tensor (3,H,W) in RGB order, values not normalised.
        * ``target``: dict with ``classes`` (list of str) and float32 arrays
          ``bbox2d`` (N,4), ``dims`` (N,3), ``loc`` (N,3), ``ry`` (N,).
        * ``meta``: dict with ``id``, ``H``, ``W`` and the ``P2`` matrix.

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        sid = self.ids[idx]
        img_path = self.img_dir/f"{sid}.png"
        img = cv2.imread(str(img_path))
        if img is None:
            # cv2.imread returns None on failure; fail with a readable message
            # instead of a cryptic cvtColor error.
            raise FileNotFoundError(f"could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w = img.shape[:2]

        labels = parse_label_file(str(self.lbl_dir/f"{sid}.txt"))
        calib  = read_calib_file(str(self.calib_dir/f"{sid}.txt"))
        P2     = calib["P2"]

        # Plain tensor conversion (Normalize/Resize etc. to be added for real training)
        img_t = torch.from_numpy(img).permute(2,0,1).float()  # (3,H,W)
        meta = {"id": sid, "H": h, "W": w, "P2": P2}

        # Lightweight label arrays; the explicit reshape keeps the documented
        # (N,4)/(N,3) shapes even for frames with no objects (N == 0).
        target = {
            "classes": [o.cls for o in labels],
            "bbox2d":  self._as_rows([o.bbox for o in labels], 4),            # (N,4)
            "dims":    self._as_rows([o.dims for o in labels], 3),            # (N,3)
            "loc":     self._as_rows([o.loc  for o in labels], 3),            # (N,3)
            "ry":      np.array([o.ry   for o in labels], dtype=np.float32),  # (N,)
        }
        return img_t, target, meta

# Quick dataset check: build both splits and inspect the first training item
train_ds = KittiMono3DDataset(KITTI_ROOT, "train")
val_ds   = KittiMono3DDataset(KITTI_ROOT, "val")
print("train len:", len(train_ds), "val len:", len(val_ds))

img_t, target, meta = train_ds[0]
print("image:", img_t.shape, "num objs:", len(target["classes"]), "P2 shape:", meta["P2"].shape)


train len: 5984 val len: 1497
image: torch.Size([3, 370, 1224]) num objs: 1 P2 shape: (3, 4)


In [13]:
# Pull one shuffled batch. The identity collate_fn returns the list of
# (img_t, target, meta) tuples unchanged instead of stacking them.
# NOTE(review): a lambda cannot be pickled, so this presumably only works when
# DataLoader workers are forked (the Linux default) — confirm before running elsewhere.
loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=4, collate_fn=lambda x: x)
batch = next(iter(loader))

for i,(img_t, tgt, meta) in enumerate(batch):
    print(f"[{i}] id={meta['id']}  img={tuple(img_t.shape)}  objs={len(tgt['classes'])}")


[0] id=003238  img=(3, 375, 1242)  objs=7
[1] id=006762  img=(3, 375, 1242)  objs=4


In [14]:
from pathlib import Path
import numpy as np
import json, math

# Object classes handled below; the list order defines the class index used by class_to_idx.
CLASSES = ["Car", "Pedestrian", "Cyclist"]

def collect_class_stats(label_dir, classes=None):
    """Compute per-class statistics of the 3D box dimensions ``[h,w,l]``.

    Args:
        label_dir: Directory whose ``*.txt`` label files are scanned.
        classes: Class names to collect. Defaults to the module-level
            ``CLASSES`` so existing calls behave as before.

    Returns:
        Dict mapping class name to ``{"mean": [h,w,l], "std": [h,w,l],
        "count": n}``. A class with no objects gets a unit mean, a zero std
        and ``count`` 0, so every entry exposes the same keys.
    """
    if classes is None:
        classes = CLASSES
    stats = {c: [] for c in classes}
    for p in sorted(Path(label_dir).glob("*.txt")):
        for o in parse_label_file(str(p)):
            if o.cls in stats:
                stats[o.cls].append(o.dims)  # [h,w,l]
    out = {}
    for c, arr in stats.items():
        if not arr:
            # Placeholder entry; "std" is included so consumers can rely on it.
            out[c] = {"mean": [1.0, 1.0, 1.0], "std": [0.0, 0.0, 0.0], "count": 0}
            continue
        a = np.array(arr, dtype=np.float32)
        out[c] = {"mean": a.mean(0).tolist(), "std": a.std(0).tolist(), "count": int(a.shape[0])}
    return out

# Per-class mean [h,w,l] priors, later added back to the network's dimension residuals.
# NOTE(review): the statistics cover every file in training/label_2, which includes
# the frames listed in ImageSets/val.txt — confirm this is intended.
priors = collect_class_stats(f"{TRAIN_DIR}/label_2")
print(json.dumps(priors, indent=2))
class_to_idx = {c:i for i,c in enumerate(CLASSES)}
# One row of means per class, in CLASSES order
priors_arr = np.stack([np.array(priors[c]["mean"], dtype=np.float32) for c in CLASSES], 0)  # (3,3)
priors_arr

{
  "Car": {
    "mean": [
      1.5260810852050781,
      1.6286139488220215,
      3.8839685916900635
    ],
    "std": [
      0.1366989016532898,
      0.10216215252876282,
      0.42591333389282227
    ],
    "count": 28742
  },
  "Pedestrian": {
    "mean": [
      1.7607048749923706,
      0.6601871848106384,
      0.8422839045524597
    ],
    "std": [
      0.11325047165155411,
      0.14265143871307373,
      0.23489883542060852
    ],
    "count": 4487
  },
  "Cyclist": {
    "mean": [
      1.7372057437896729,
      0.5967734456062317,
      1.7635438442230225
    ],
    "std": [
      0.09479568153619766,
      0.12417350709438324,
      0.17660865187644958
    ],
    "count": 1627
  }
}


array([[1.5260811 , 1.628614  , 3.8839686 ],
       [1.7607049 , 0.6601872 , 0.8422839 ],
       [1.7372057 , 0.59677345, 1.7635438 ]], dtype=float32)

In [15]:
import torch
from torch.utils.data import Dataset
import cv2, numpy as np

# Side length (pixels) of the square ROI crops fed to the network
IMG_SIZE = 128
PADDING_SCALE = 1.2  # margin factor applied around the bbox

def crop_resize(img, bbox, out_size=IMG_SIZE, scale=PADDING_SCALE):
    """Cut a square window around ``bbox`` and resize it to ``out_size`` x ``out_size``.

    The window is centred on the box, its side is ``max(width, height) * scale``,
    and any part falling outside the image is filled with black.

    Args:
        img: Image array (H, W, ...).
        bbox: ``[l, t, r, b]`` box.
        out_size: Output side length in pixels.
        scale: Enlargement factor applied to the longer box side.

    Returns:
        The resized crop.
    """
    img_h, img_w = img.shape[:2]
    left, top, right, bottom = bbox
    center_x = (left + right)/2
    center_y = (top + bottom)/2
    side = max(right - left, bottom - top) * scale
    x1 = int(center_x - side/2)
    y1 = int(center_y - side/2)
    x2 = int(center_x + side/2)
    y2 = int(center_y + side/2)

    # Amount by which the window sticks out of the image on each side.
    pad_l = -x1 if x1 < 0 else 0
    pad_t = -y1 if y1 < 0 else 0
    pad_r = x2 - img_w if x2 > img_w else 0
    pad_b = y2 - img_h if y2 > img_h else 0
    if pad_l or pad_t or pad_r or pad_b:
        img = cv2.copyMakeBorder(img, pad_t, pad_b, pad_l, pad_r, cv2.BORDER_CONSTANT, value=(0,0,0))
        # Shift the window into the padded image's coordinates.
        x1, x2 = x1 + pad_l, x2 + pad_l
        y1, y2 = y1 + pad_t, y2 + pad_t

    window = img[y1:y2, x1:x2]
    return cv2.resize(window, (out_size, out_size), interpolation=cv2.INTER_LINEAR)

# Per-channel mean/std applied to the [0,1]-scaled RGB crops in KittiObjectROIs.__getitem__
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD  = np.array([0.229, 0.224, 0.225], dtype=np.float32)

class KittiObjectROIs(Dataset):
    """Object-level dataset: one item per labelled object of the selected classes.

    Each item is a normalised square crop around the object's 2D box together
    with the regression targets derived from its label.
    """

    def __init__(self, kitti_root:str, split:str="train", classes=CLASSES, max_samples=None):
        """
        Args:
            kitti_root: Dataset root containing ``ImageSets`` and ``training``.
            split: Name of the id list to load (``ImageSets/<split>.txt``).
            classes: Class names to keep.
            max_samples: If truthy, keep only the first ``max_samples`` objects.
        """
        self.root = Path(kitti_root)
        self.split = split
        self.classes = set(classes)
        ids = (self.root/"ImageSets"/f"{split}.txt").read_text().strip().splitlines()
        self.img_dir = self.root/"training"/"image_2"
        self.lbl_dir = self.root/"training"/"label_2"
        self.calib_dir = self.root/"training"/"calib"
        # (frame id, KittiObject) pairs, in id-list order then label-file order
        self.items = []
        for sid in ids:
            lbl_path = self.lbl_dir/f"{sid}.txt"
            if not lbl_path.exists():
                continue
            for o in parse_label_file(str(lbl_path)):
                if o.cls in self.classes:
                    self.items.append((sid, o))
        if max_samples:
            self.items = self.items[:max_samples]
        print(f"[{split}] objects: {len(self.items)}")

    def __len__(self): return len(self.items)

    def __getitem__(self, idx):
        """Return a dict describing the idx-th object.

        Keys: ``img`` (3,IMG_SIZE,IMG_SIZE) normalised crop, ``cls_idx``,
        ``prior`` (3,) class mean dims, ``dims_res`` (3,) = dims - prior,
        ``logz`` = log of depth clamped to >= 1e-3, ``yaw`` (2,) =
        (sin ry, cos ry), ``uv`` (2,) 2D box centre, ``P2`` (3,4), ``sid``.

        Raises:
            FileNotFoundError: If the frame image cannot be read.
        """
        sid, o = self.items[idx]
        img_path = self.img_dir/f"{sid}.png"
        img_bgr = cv2.imread(str(img_path))
        if img_bgr is None:
            # cv2.imread returns None on failure; fail with a readable message
            # instead of a cryptic cvtColor error.
            raise FileNotFoundError(f"could not read image: {img_path}")
        img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
        crop = crop_resize(img, o.bbox, IMG_SIZE, PADDING_SCALE)
        # to tensor (ImageNet norm)
        x = (crop.astype(np.float32)/255.0 - IMAGENET_MEAN)/IMAGENET_STD
        x = torch.from_numpy(x).permute(2,0,1)  # (3,128,128)

        cls_idx = class_to_idx[o.cls]
        prior = torch.from_numpy(priors_arr[cls_idx])  # (3,)
        dims = torch.tensor(o.dims, dtype=torch.float32)
        dims_res = dims - prior

        # Depth target in log space; the clamp keeps log() finite.
        z = torch.tensor(o.loc[2], dtype=torch.float32)
        logz = torch.log(torch.clamp(z, min=1e-3))

        # Yaw is encoded as (sin, cos) to avoid the angle wrap-around.
        ry = torch.tensor(o.ry, dtype=torch.float32)
        yaw_tgt = torch.stack([torch.sin(ry), torch.cos(ry)], 0)  # (2,)

        # 2D center (u,v)
        l,t,r,b = o.bbox
        uv = torch.tensor([(l+r)/2.0, (t+b)/2.0], dtype=torch.float32)

        # calib
        P2 = torch.from_numpy(read_calib_file(str(self.calib_dir/f"{sid}.txt"))["P2"])

        return {
            "img": x,
            "cls_idx": torch.tensor(cls_idx, dtype=torch.long),
            "prior": prior,
            "dims_res": dims_res,
            "logz": logz,
            "yaw": yaw_tgt,
            "uv": uv,
            "P2": P2,
            "sid": sid
        }

# Sanity check: build the ROI datasets for both splits and show the shape of every field of the first item
train_roi = KittiObjectROIs(KITTI_ROOT, "train", max_samples=None)
val_roi   = KittiObjectROIs(KITTI_ROOT, "val", max_samples=None)

sample = train_roi[0]
{ k: (v.shape if hasattr(v, 'shape') else v) for k,v in sample.items() if k not in ('sid',) }


[train] objects: 27974
[val] objects: 6882


{'img': torch.Size([3, 128, 128]),
 'cls_idx': torch.Size([]),
 'prior': torch.Size([3]),
 'dims_res': torch.Size([3]),
 'logz': torch.Size([]),
 'yaw': torch.Size([2]),
 'uv': torch.Size([2]),
 'P2': torch.Size([3, 4])}

In [16]:
import torch
import torch.nn as nn
import torchvision.models as models

class Mono3DHead(nn.Module):
    """MLP regression head: feat_dim -> 256 -> 128 -> out_dim, with ReLU after each hidden layer."""

    def __init__(self, feat_dim=512, out_dim=6):
        super().__init__()
        widths = [feat_dim, 256, 128]
        layers = []
        # Layers are created in the same order as before, so parameter names
        # (fc.0, fc.2, fc.4) and seeded initialisation are unchanged.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Linear(widths[-1], out_dim))
        self.fc = nn.Sequential(*layers)

    def forward(self, f):
        """Map features (B, feat_dim) to raw outputs (B, out_dim)."""
        return self.fc(f)

class Mono3DNet(nn.Module):
    """ResNet-18 backbone (ImageNet weights, final fc removed) followed by a Mono3DHead."""

    def __init__(self):
        super().__init__()
        resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        # Keep every child except the last one (the classification fc), so
        # the backbone ends at the average pool with 512 features.
        trunk = list(resnet.children())[:-1]
        self.backbone = nn.Sequential(*trunk)
        self.head = Mono3DHead(512, 6)

    def forward(self, x):
        """Return ``(dims_res, logz, yaw)`` for a batch of crops.

        ``dims_res`` is (B,3), ``logz`` is (B,1) and ``yaw`` is the (B,2)
        (sin, cos) vector scaled to unit length.
        """
        feats = torch.flatten(self.backbone(x), 1)   # (B, 512)
        raw = self.head(feats)                       # (B, 6)
        dims_res, logz, yaw_raw = raw[:, 0:3], raw[:, 3:4], raw[:, 4:6]
        # The small epsilon guards against division by a zero-length vector.
        scale = torch.linalg.norm(yaw_raw, dim=1, keepdim=True) + 1e-6
        return dims_res, logz, yaw_raw / scale

# Use the GPU when available, build the network, and report its parameter count in millions
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Mono3DNet().to(device)
sum(p.numel() for p in model.parameters())/1e6


11.34151

In [17]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Merge a list of sample dicts into one batch dict.

    Keys whose value in the first sample is a tensor are stacked along a new
    leading dimension; all other keys are collected into plain lists.
    """
    first = batch[0]
    return {
        key: (torch.stack([sample[key] for sample in batch], 0)
              if isinstance(first[key], torch.Tensor)
              else [sample[key] for sample in batch])
        for key in first.keys()
    }

# Loaders over the per-object ROI datasets; only the training loader shuffles
batch_size = 64
train_loader = DataLoader(train_roi, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_roi,   batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate_fn)

# Loss weights for the dims / depth / yaw terms combined in loss_fn
W_DIM = 1.0
W_Z   = 1.0
W_YAW = 1.0

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)

def loss_fn(batch, pred):
    """Weighted sum of the dimension, depth and yaw losses.

    Args:
        batch: Collated batch dict with ``dims_res``, ``prior``, ``logz`` and ``yaw``.
        pred: ``(dims_res_pred, logz_pred, yaw_pred)`` from the model; the
            yaw prediction is already normalised.

    Returns:
        ``(loss, parts)`` where ``loss`` is the scalar tensor to optimise and
        ``parts`` holds the three components as Python floats.
    """
    dims_res_pred, logz_pred, yaw_pred = pred

    # Dimensions: L1 between prior + residual for prediction and target.
    prior = batch["prior"].to(device)                # (B,3)
    dims_tgt = batch["dims_res"].to(device) + prior
    dims_pred = dims_res_pred + prior
    dims_err = dims_pred - dims_tgt
    loss_dims = dims_err.abs().mean()

    # Depth: L1 in log space.
    logz_err = logz_pred.squeeze(1) - batch["logz"].to(device)
    loss_z = logz_err.abs().mean()

    # Yaw: MSE between the (sin, cos) vectors.
    yaw_err = yaw_pred - batch["yaw"].to(device)
    loss_yaw = (yaw_err**2).mean()

    loss = W_DIM*loss_dims + W_Z*loss_z + W_YAW*loss_yaw
    parts = {"loss_dims": loss_dims.item(), "loss_z": loss_z.item(), "loss_yaw": loss_yaw.item()}
    return loss, parts


In [23]:
from time import time

def train_one_epoch(model, loader, optimizer):
    """Run one optimisation pass over ``loader``.

    Returns:
        ``(mean_loss, mean_parts)``: the total loss and each loss component,
        averaged over the number of batches.
    """
    model.train()
    running_total = 0.0
    running_parts = {"loss_dims":0.0, "loss_z":0.0, "loss_yaw":0.0}
    for batch in loader:
        # Move tensors to the training device; leave lists (e.g. ids) as they are.
        on_device = {}
        for key, value in batch.items():
            on_device[key] = value.to(device) if isinstance(value, torch.Tensor) else value

        pred = model(on_device["img"])
        loss, parts = loss_fn(on_device, pred)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_total += loss.item()
        for name in running_parts:
            running_parts[name] += parts[name]

    num_batches = len(loader)
    mean_parts = {name: acc/num_batches for name, acc in running_parts.items()}
    return running_total/num_batches, mean_parts

@torch.no_grad()
def eval_one_epoch(model, loader):
    """Evaluate ``model`` over ``loader`` without gradient tracking.

    Returns:
        ``(mean_loss, mean_parts)``: the total loss and each loss component,
        averaged over the number of batches.
    """
    model.eval()
    loss_sum = 0.0
    part_sums = {"loss_dims":0.0, "loss_z":0.0, "loss_yaw":0.0}
    for batch in loader:
        # Tensors go to the evaluation device; non-tensor entries pass through.
        moved = {}
        for key, value in batch.items():
            moved[key] = value.to(device) if isinstance(value, torch.Tensor) else value
        loss, parts = loss_fn(moved, model(moved["img"]))
        loss_sum += loss.item()
        for name in part_sums:
            part_sums[name] += parts[name]
    batches = len(loader)
    return loss_sum/batches, {name: acc/batches for name, acc in part_sums.items()}

EPOCHS = 5
best = float("inf")

# Checkpoint location. torch.save does not create missing directories, so
# make sure the parent exists before the first save.
ckpt_path = Path("./models/mono3d_baseline.pt")
ckpt_path.parent.mkdir(parents=True, exist_ok=True)

for ep in range(1, EPOCHS+1):
    t0=time()
    tr, tr_log = train_one_epoch(model, train_loader, optimizer)
    va, va_log = eval_one_epoch(model, val_loader)
    dt=time()-t0
    print(f"[{ep}/{EPOCHS}] train {tr:.4f} (dims {tr_log['loss_dims']:.3f} z {tr_log['loss_z']:.3f} yaw {tr_log['loss_yaw']:.3f}) | "
          f"val {va:.4f} (dims {va_log['loss_dims']:.3f} z {va_log['loss_z']:.3f} yaw {va_log['loss_yaw']:.3f})  [{dt:.1f}s]")
    # Keep only the weights with the lowest validation loss so far.
    if va < best:
        best = va
        torch.save(model.state_dict(), str(ckpt_path))
        # Log the path actually written, not just the file name.
        print("  -> saved:", ckpt_path)


[1/5] train 0.4841 (dims 0.138 z 0.161 yaw 0.185) | val 0.4125 (dims 0.132 z 0.121 yaw 0.159)  [102.1s]
  -> saved: mono3d_baseline.pt
[2/5] train 0.3889 (dims 0.123 z 0.153 yaw 0.112) | val 0.3873 (dims 0.121 z 0.146 yaw 0.121)  [146.7s]
  -> saved: mono3d_baseline.pt
[3/5] train 0.3422 (dims 0.112 z 0.139 yaw 0.090) | val 0.3378 (dims 0.114 z 0.109 yaw 0.114)  [150.9s]
  -> saved: mono3d_baseline.pt
[4/5] train 0.2956 (dims 0.103 z 0.125 yaw 0.067) | val 0.2968 (dims 0.105 z 0.105 yaw 0.087)  [134.4s]
  -> saved: mono3d_baseline.pt
[5/5] train 0.3307 (dims 0.103 z 0.134 yaw 0.094) | val 0.3100 (dims 0.106 z 0.103 yaw 0.101)  [143.8s]


In [26]:
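# === 1) compute_box_3d variant used by the prediction/visualisation cells. NOTE(review): the KITTI devkit places length along x and width along z at ry=0 (as compute_box_3d above does); check any w/l swap against that convention ===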
import numpy as np, cv2
import torch
from math import atan2
from pathlib import Path

def compute_box_3d_fixed(dim, loc, ry):
    """
    Build the 8 corners of a 3D box in camera coordinates (float32).

    dim=[h,w,l], loc=[X,Y,Z] (camera coord), ry: yaw around Y
    return: (3,8) camera coords; corners 0..3 sit at y = 0 and corners 4..7
    at y = -h in the object frame.

    The object-frame axes follow the KITTI devkit convention: at ry = 0 the
    length l runs along x and the width w along z. A car driving straight
    ahead is labelled with ry close to -pi/2, which rotates its length onto
    the camera z axis. The earlier version of this function placed w on x
    and l on z, which drew every box rotated by 90 degrees.
    """
    h, w, l = dim
    # x=±l/2, z=±w/2 (KITTI devkit corner layout)
    x_c = [ l/2,  l/2, -l/2, -l/2,  l/2,  l/2, -l/2, -l/2]
    y_c = [   0,    0,    0,    0,  -h,  -h,   -h,   -h]
    z_c = [ w/2, -w/2, -w/2,  w/2,  w/2, -w/2, -w/2,  w/2]
    corners = np.vstack([x_c, y_c, z_c]).astype(np.float32)

    # Rotation about the camera Y axis.
    c, s = np.cos(ry), np.sin(ry)
    R = np.array([[ c, 0,  s],
                  [ 0, 1,  0],
                  [-s, 0,  c]], dtype=np.float32)
    corners = R @ corners
    corners = corners + np.array(loc, dtype=np.float32).reshape(3,1)
    return corners

# draw 함수는 그대로 사용 가능 (project_to_image, draw_projected_box3d 등)

# === 2) 하단 중앙(uv_bottom) 사용 ===
def uv_center_and_bottom_from_bbox(bbox):
    """Return ``((u_c, v_c), (u_b, v_b))`` for an ``[l,t,r,b]`` box.

    The first pair is the box centre; the second is the midpoint of the
    bottom edge.
    """
    left, top, right, bottom = bbox
    center = ((left + right)/2.0, (top + bottom)/2.0)
    bottom_mid = ((left + right)/2.0, bottom)
    return center, bottom_mid

def uvz_to_xyz(uv, Z, P2):
    """Back-project pixel ``uv`` at depth ``Z`` to a 3D point.

    Inverts ``[u*w, v*w, w] = P2 @ [X, Y, Z, 1]`` for a projection matrix of
    the form ``[[fx,0,cx,tx],[0,fy,cy,ty],[0,0,1,tz]]``. The translation
    column ``(tx, ty, tz)`` is taken into account; the earlier version
    ignored it, which shifted X and Y whenever that column is non-zero (as
    it is for KITTI's P2). A 3x3 intrinsic matrix is also accepted and is
    treated as having a zero translation column.

    Args:
        uv: ``(u, v)`` pixel coordinates.
        Z: Depth of the point along the camera z axis.
        P2: (3,4) projection matrix, or (3,3) intrinsics.

    Returns:
        float32 array ``[X, Y, Z]``.
    """
    P = np.asarray(P2, dtype=np.float64)   # float64 keeps the u*w products accurate
    fx, fy = P[0,0], P[1,1]
    cx, cy = P[0,2], P[1,2]
    if P.shape[1] > 3:
        tx, ty, tz = P[0,3], P[1,3], P[2,3]
    else:
        tx = ty = tz = 0.0
    u, v = uv
    w = Z + tz                       # homogeneous divisor used by the projection
    X = (u*w - cx*Z - tx) / fx
    Y = (v*w - cy*Z - ty) / fy
    return np.array([X, Y, Z], dtype=np.float32)

@torch.no_grad()
def predict_on_samples_fixed(model, dataset, n=8, seed=0, use_bottom=True):
    """Run the model on a seeded random subset of ``dataset``.

    Args:
        model: Network returning ``(dims_res, logz, yaw)``.
        dataset: ROI dataset exposing ``items`` as (sid, object) pairs.
        n: Maximum number of samples to draw (without replacement).
        seed: Seed for the NumPy generator that picks the indices.
        use_bottom: Back-project the bottom-centre of the GT 2D box when
            True, otherwise its centre.

    Returns:
        List of ``(index, dims_pred [h,w,l], loc [X,Y,Z], ry)`` tuples.
    """
    model.eval()
    sample_count = min(n, len(dataset))
    picked = np.random.default_rng(seed).choice(len(dataset), size=sample_count, replace=False)

    results = []
    for i in picked:
        item = dataset[i]
        _, gt_obj = dataset.items[i]   # GT object belonging to the same index

        # Forward pass on a batch of one.
        dims_res, logz, yaw = model(item["img"].unsqueeze(0).to(device))
        dims_pred = dims_res.squeeze(0).cpu().numpy() + item["prior"].cpu().numpy()  # [h,w,l]
        Z = float(torch.exp(logz.squeeze()).cpu().numpy())
        sin_cos = yaw.squeeze(0).cpu().numpy()
        ry = float(atan2(sin_cos[0], sin_cos[1]))

        # Pixel to back-project: bottom-centre or centre of the GT 2D box.
        center_uv, bottom_uv = uv_center_and_bottom_from_bbox(gt_obj.bbox)
        uv = bottom_uv if use_bottom else center_uv

        loc = uvz_to_xyz(uv, Z, item["P2"].cpu().numpy())
        results.append((i, dims_pred, loc, ry))
    return results

# === 3) 올바른 GT 2D 박스 참조 & 3D 투영 ===
def draw_pred_boxes_fixed(dataset, preds, out_dir):
    """Draw each predicted 3D box (orange) with its GT 2D box (green) and save one JPEG per prediction.

    Args:
        dataset: ROI dataset the predictions were made on.
        preds: Iterable of ``(index, dims, loc, ry)`` tuples.
        out_dir: Output directory (created when missing).

    Returns:
        ``out_dir`` as a ``Path``. Predictions with any corner at z <= 0 are
        skipped and produce no file.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    image_root = Path(TRAIN_DIR)/"image_2"

    for i, dims, loc, ry in preds:
        sid, gt_obj = dataset.items[i]
        img = cv2.imread(str(image_root/f"{sid}.png"))
        P2 = dataset[i]["P2"].cpu().numpy()

        # Predicted 3D box; skipped when it reaches behind the camera.
        corners_3d = compute_box_3d_fixed(dims, loc, ry)
        if (corners_3d[2] <= 0).any():
            continue
        img = draw_projected_box3d(img, project_to_image(corners_3d, P2), (0,165,255), 2)

        # GT 2D box.
        l,t,r,b = map(int, gt_obj.bbox)
        cv2.rectangle(img,(l,t),(r,b),(0,255,0),2)

        cv2.imwrite(str(out_dir/f"{sid}_pred.jpg"), img)
    return out_dir

# Run: predict on 12 seeded validation ROIs and save the overlays
# (note: this rebinds out_dir, which an earlier cell used for the 2D preview folder)
preds = predict_on_samples_fixed(model, val_roi, n=12, seed=1, use_bottom=True)
out_dir = draw_pred_boxes_fixed(val_roi, preds, f"./preview_pred_fixed")
print("saved to:", out_dir)

saved to: preview_pred_fixed


In [30]:
# 프레임별로 예측을 모아서 한 장에 모두 그려 저장
import cv2, numpy as np, torch
from pathlib import Path
from math import atan2

# compute_box_3d_fixed / project_to_image / draw_projected_box3d / read_calib_file
# / uvz_to_xyz / uv_center_and_bottom_from_bbox 가 위 셀에 있다고 가정

from collections import defaultdict

@torch.no_grad()
def predict_grouped_by_sid(model, dataset, use_bottom=True, max_frames=20):
    """Predict every ROI of the first ``max_frames`` frames and group the results by frame id.

    Args:
        model: Network returning ``(dims_res, logz, yaw)``.
        dataset: ROI dataset exposing ``items`` as (sid, object) pairs.
        use_bottom: Back-project the bottom-centre of the GT 2D box when
            True, otherwise its centre.
        max_frames: Number of distinct frame ids to keep, in order of first
            appearance in ``dataset.items``.

    Returns:
        ``(sids, sid_to_preds)``: the selected frame ids and a mapping from
        frame id to a list of ``(dims_pred [h,w,l], loc [X,Y,Z], ry)``.
    """
    model.eval()
    # 1) Distinct frame ids in first-appearance order (dict keys keep insertion order).
    sids = list(dict.fromkeys(sid for sid, _ in dataset.items))[:max_frames]
    # Membership set built once instead of on every loop iteration.
    wanted = set(sids)

    sid_to_preds = defaultdict(list)
    for i, (sid, gt_obj) in enumerate(dataset.items):
        if sid not in wanted:
            continue
        item = dataset[i]  # dict
        # Inference on a batch of one
        x = item["img"].unsqueeze(0).to(device)
        dims_res, logz, yaw = model(x)
        prior = item["prior"].cpu().numpy()
        dims_pred = (dims_res.squeeze(0).cpu().numpy() + prior)   # [h,w,l]
        Z = float(torch.exp(logz.squeeze()).cpu().numpy())
        yaw_vec = yaw.squeeze(0).cpu().numpy()
        ry = float(atan2(yaw_vec[0], yaw_vec[1]))
        # uv: bottom-centre of the GT box by default
        (u_c, v_c), (u_b, v_b) = uv_center_and_bottom_from_bbox(gt_obj.bbox)
        uv = (u_b, v_b) if use_bottom else (u_c, v_c)
        P2 = item["P2"].cpu().numpy()
        loc = uvz_to_xyz(uv, Z, P2)
        sid_to_preds[sid].append((dims_pred, loc, ry))
    return sids, sid_to_preds

def draw_grouped_results(dataset, sids, sid_to_preds, out_dir="./preview_pred_grouped"):
    """Save one image per frame with GT 3D boxes (green) and predicted 3D boxes (orange).

    Args:
        dataset: Not used by this function; kept for call compatibility.
        sids: Frame ids to render.
        sid_to_preds: Mapping from frame id to ``(dims, loc, ry)`` tuples.
        out_dir: Output directory (created when missing).

    Returns:
        ``out_dir`` as a ``Path``.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    train_root = Path(TRAIN_DIR)

    for sid in sids:
        img = cv2.imread(str(train_root/"image_2"/f"{sid}.png"))
        P2 = read_calib_file(str(train_root/"calib"/f"{sid}.txt"))["P2"]

        # Ground-truth boxes of the three classes of interest.
        for gt in parse_label_file(str(train_root/"label_2"/f"{sid}.txt")):
            if gt.cls not in ("Car","Pedestrian","Cyclist"):
                continue
            gt_corners = compute_box_3d_fixed(gt.dims, gt.loc, gt.ry)
            if not (gt_corners[2] > 0).all():
                continue
            img = draw_projected_box3d(img, project_to_image(gt_corners, P2), (0,255,0), 2)

        # Every ROI prediction recorded for this frame.
        for dims_pred, loc, ry in sid_to_preds.get(sid, []):
            pred_corners = compute_box_3d_fixed(dims_pred, loc, ry)
            if (pred_corners[2] <= 0).any():
                continue
            img = draw_projected_box3d(img, project_to_image(pred_corners, P2), (0,165,255), 2)

        cv2.imwrite(str(out_dir/f"{sid}_all.jpg"), img)
    return out_dir

# Run: render the first 20 validation frames (max_frames=20) as an example
sids, sid_to_preds = predict_grouped_by_sid(model, val_roi, use_bottom=True, max_frames=20)
out_dir = draw_grouped_results(val_roi, sids, sid_to_preds, out_dir="./preview_pred")
print("saved dir:", out_dir)


saved dir: preview_pred


In [31]:
import numpy as np, torch
from torch.utils.data import DataLoader
from math import atan2

@torch.no_grad()
def quick_metrics(model, dataset, max_samples=1000):
    """Compute simple error metrics over the first ``max_samples`` items.

    Returns:
        Dict with ``Z_MAE`` (mean absolute depth error), ``Dims_L1`` (mean
        absolute [h,w,l] error), ``Yaw_deg_MAE`` (mean absolute yaw error in
        degrees, wrapped to [0, 180]) and ``N`` (number of items used).
    """
    model.eval()
    idxs = list(range(len(dataset))[:max_samples])
    depth_errs, dims_errs, yaw_errs = [], [], []

    for i in idxs:
        item = dataset[i]

        # Prediction, decoded to metric values.
        dr, lz, yaw = model(item["img"].unsqueeze(0).to(device))
        dims_pred = dr.squeeze(0).cpu().numpy() + item["prior"].numpy()  # [h,w,l]
        Z_pred = float(torch.exp(lz.squeeze()).cpu().numpy())
        sin_cos = yaw.squeeze(0).cpu().numpy()
        ry_pred = float(atan2(sin_cos[0], sin_cos[1]))

        # Targets, decoded from the same encoding the dataset produced.
        dims_t = (item["dims_res"] + item["prior"]).numpy()
        Z_t = float(torch.exp(item["logz"]).numpy())
        ry_t = float(atan2(item["yaw"][0].numpy(), item["yaw"][1].numpy()))

        depth_errs.append(abs(Z_pred - Z_t))
        dims_errs.append(np.abs(dims_pred - dims_t).mean())
        # Angular difference wrapped into [-pi, pi) before taking its magnitude.
        d = abs((ry_pred - ry_t + np.pi)%(2*np.pi) - np.pi)
        yaw_errs.append(np.degrees(d))

    return {
        "Z_MAE": float(np.mean(depth_errs)),
        "Dims_L1": float(np.mean(dims_errs)),
        "Yaw_deg_MAE": float(np.mean(yaw_errs)),
        "N": len(idxs)
    }

# Evaluate on the first 1000 validation ROIs and display the resulting dict
metrics = quick_metrics(model, val_roi, max_samples=1000)
metrics


{'Z_MAE': 2.4862086089849473,
 'Dims_L1': 0.10567159950733185,
 'Yaw_deg_MAE': 13.41263393770839,
 'N': 1000}

In [41]:
import cv2, math, numpy as np
from pathlib import Path
from math import atan, atan2

# ===== BEV 유틸 =====
def bev_canvas(W=600, H=600, bg=255):
    """Return an (H, W, 3) uint8 image filled with the value ``bg``."""
    shape = (H, W, 3)
    return np.full(shape, bg, dtype=np.uint8)

def world_to_bev(x, z, x_range=(-20,20), z_range=(0,60), W=600, H=600):
    """Map a ground-plane point (x, z) to integer BEV pixel coordinates (u, v).

    x spans ``x_range`` from the left to the right image edge; z spans
    ``z_range`` from the bottom to the top edge (v decreases as z grows).
    """
    x_min, x_max = x_range[0], x_range[1]
    z_min, z_max = z_range[0], z_range[1]
    u = (x - x_min) / (x_max-x_min) * (W-1)
    v = H-1 - (z - z_min) / (z_max-z_min) * (H-1)
    return int(round(u)), int(round(v))

def draw_rotated_bev_rect(bev, corners_xz, color, x_range=(-20,20), z_range=(0,60)):
    """Draw the closed polygon through ``corners_xz`` (iterable of (X, Z)) onto the BEV image in place."""
    canvas_h, canvas_w = bev.shape[0], bev.shape[1]
    pixels = [world_to_bev(X, Z, x_range, z_range, canvas_w, canvas_h)
              for X, Z in corners_xz]
    outline = np.array(pixels, dtype=np.int32)
    cv2.polylines(bev, [outline], isClosed=True, color=color, thickness=2)

def get_bottom_rect_xz(dims, loc, ry):
    """Return the (X, Z) pairs of box corners 0..3 from ``compute_box_3d``.

    Returns None when any of the 8 corners has z <= 0, so callers can skip
    boxes that reach behind the camera.
    """
    corners_3d = compute_box_3d(dims, loc, ry)  # (3,8)
    depth = corners_3d[2]
    if (depth <= 0).any():
        return None
    return list(zip(corners_3d[0, :4], depth[:4]))

def estimate_half_fov_x(P2, img_w):
    """Approximate the horizontal half field-of-view (radians) from a projection matrix.

    Uses tan(theta) = (u - cx) / fx evaluated at the left edge (u = 0) and the
    right edge (u = img_w), and averages the two absolute values.

    Args:
        P2: projection matrix; fx is read from [0, 0] and cx from [0, 2].
        img_w: image width in pixels.

    Returns:
        The half angle in radians.
    """
    focal_x = P2[0,0]
    center_x = P2[0,2]
    tan_left = abs((0 - center_x) / focal_x)
    tan_right = abs((img_w - center_x) / focal_x)
    tan_half_fov = 0.5 * (tan_left + tan_right)
    return atan(tan_half_fov)

def draw_fov_wedge(bev, P2, img_w, x_range=(-20,20), z_range=(0,60), color=(128,128,128)):
    """Draw the camera's horizontal field-of-view wedge on the BEV canvas.

    Two lines run from the camera origin (0, 0) to x = +/- z_max * tan(theta)
    at z = z_max, where theta comes from ``estimate_half_fov_x``. The camera
    position is then marked with a filled circle of colour (0, 0, 255).

    Args:
        bev: canvas image; it is drawn on in place.
        P2: projection matrix forwarded to ``estimate_half_fov_x``.
        img_w: image width in pixels.
        x_range, z_range: extents forwarded to ``world_to_bev``.
        color: colour of the two wedge lines.
    """
    canvas_h, canvas_w = bev.shape[:2]
    half_fov = estimate_half_fov_x(P2, img_w)  # radians
    z_far = z_range[1]
    origin_px = world_to_bev(0.0, 0.0, x_range, z_range, canvas_w, canvas_h)

    # One boundary line per side of the optical axis.
    for side in (-1, 1):
        x_far = side * z_far * math.tan(half_fov)
        far_px = world_to_bev(x_far, z_far, x_range, z_range, canvas_w, canvas_h)
        cv2.line(bev, origin_px, far_px, color, 2, cv2.LINE_AA)

    # Camera position marker at the origin.
    cv2.circle(bev, origin_px, 4, (0,0,255), -1)

# ===== 프레임 단위 예측/그리기 =====
@torch.no_grad()
def visualize_frame_with_bev(model, dataset, sid=None,
                             x_range=(-20,20), z_range=(0,60),
                             save_dir=f"{KITTI_ROOT}/preview_combo"):
    """Render ground-truth and predicted 3D boxes for one frame next to a BEV plot.

    The frame's image, label and calibration files are read from ``TRAIN_DIR``.
    Ground-truth boxes of class Car/Pedestrian/Cyclist are drawn in green and
    the model's per-ROI predictions in orange (BGR), both on the camera image
    and on a 600x600 bird's-eye-view canvas. The two views are concatenated
    side by side and written as a JPEG.

    Args:
        model: network called as ``model(x)`` and returning
            ``(dims_res, logz, yaw)``; it is switched to eval mode.
        dataset: ROI dataset exposing ``items`` (a sequence of pairs whose
            first element is the frame id) and integer indexing that yields a
            dict with "img", "prior", "uv" and "P2" entries.
        sid: frame id string; when None the smallest frame id found in
            ``dataset.items`` is used.
        x_range, z_range: BEV extents forwarded to the drawing helpers.
        save_dir: output directory, created if missing.

    Returns:
        (path of the written image as str, frame id used).

    Raises:
        AssertionError: if the dataset has no ROI for ``sid``.
        FileNotFoundError: if the frame image cannot be read.
    """
    model.eval()
    save_dir = Path(save_dir); save_dir.mkdir(parents=True, exist_ok=True)

    # 1) Decide which frame to draw.
    if sid is None:
        # Frame ids present in the dataset, in sorted order.
        val_ids = sorted({s for s,_ in dataset.items})
        sid = val_ids[0]
    # Collect the indices of every ROI belonging to that frame.
    idxs = [i for i,(s,_) in enumerate(dataset.items) if s == sid]
    assert idxs, f"{sid} 에 해당하는 ROI가 없습니다."

    # 2) Load the original image and calibration.
    img_path = Path(TRAIN_DIR)/"image_2"/f"{sid}.png"
    lbl_path = Path(TRAIN_DIR)/"label_2"/f"{sid}.txt"
    calib = read_calib_file(str(Path(TRAIN_DIR)/"calib"/f"{sid}.txt"))
    P2 = calib["P2"]
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read image: {img_path}")
    W = img.shape[1]

    # 3) Create the BEV canvas and draw the field-of-view wedge.
    bev = bev_canvas(600, 600, bg=255)
    draw_fov_wedge(bev, P2, W, x_range, z_range, color=(180,180,180))

    # 4) Ground-truth boxes (green) on both the image and the BEV.
    gts = [o for o in parse_label_file(str(lbl_path)) if o.cls in ("Car","Pedestrian","Cyclist")]
    for gt in gts:
        # Image: projected 3D ground-truth box.
        corners_3d = compute_box_3d(gt.dims, gt.loc, gt.ry)
        if not (corners_3d[2] > 0).all():  # skip boxes reaching behind the camera
            continue
        pts_2d = project_to_image(corners_3d, P2)
        img = draw_projected_box3d(img, pts_2d, (0,255,0), 2)

        # BEV: bottom face footprint.
        rect = get_bottom_rect_xz(gt.dims, gt.loc, gt.ry)
        if rect is not None:
            draw_rotated_bev_rect(bev, rect, (0,160,0), x_range, z_range)

    # 5) Predicted boxes (orange): run the model per ROI, then draw on image and BEV.
    for i in idxs:
        item = dataset[i]
        x = item["img"].unsqueeze(0).to(device)
        dims_res, logz, yaw = model(x)

        prior = item["prior"].numpy()
        dims_pred = (dims_res.squeeze(0).cpu().numpy() + prior)   # [h,w,l]
        Z = float(torch.exp(logz.squeeze()).cpu().numpy())        # depth is predicted in log space
        yaw_vec = yaw.squeeze(0).cpu().numpy()
        ry = float(atan2(yaw_vec[0], yaw_vec[1]))                 # yaw head outputs (sin, cos)
        uv = item["uv"].numpy()
        loc = uvz_to_xyz(uv, Z, item["P2"].numpy())               # [X,Y,Z]

        # Image projection, in an orange tone to stand apart from the green GT.
        corners_3d = compute_box_3d(dims_pred, loc, ry)
        if (corners_3d[2] <= 0).any():
            continue
        pts_2d = project_to_image(corners_3d, P2)
        img = draw_projected_box3d(img, pts_2d, (0,165,255), 2)   # BGR: orange

        # BEV footprint.
        rect = get_bottom_rect_xz(dims_pred, loc, ry)
        if rect is not None:
            draw_rotated_bev_rect(bev, rect, (0,165,255), x_range, z_range)

    # 6) Concatenate side by side and save.
    #    The square BEV is resized to the image height before being attached.
    bev_resized = cv2.resize(bev, (img.shape[0], img.shape[0]))
    combo = cv2.hconcat([img, bev_resized])
    out_path = save_dir / f"{sid}_preview_3d_bev.jpg"
    cv2.imwrite(str(out_path), combo)
    return str(out_path), sid

# Example run: draw frame "000021" (sid passed explicitly) from the validation ROI dataset.
out, sid = visualize_frame_with_bev(
    model,
    val_roi,
    sid="000021",
    x_range=(-20, 20),
    z_range=(0, 60),
    save_dir="./preview_3d_bev",
)
print(f"[✓] saved: {out}  (sid={sid})")


[✓] saved: preview_3d_bev/000021_preview_3d_bev.jpg  (sid=000021)


# Test

In [53]:
# ==== KITTI testing: 2D detector(ROI) + 우리 3D 헤드 (이미지+BEV 시각화) ====
import cv2, math, torch, numpy as np
from pathlib import Path
from math import atan2

# Testing split location (same path as the TEST_DIR set in the notebook's setup cell)
# and the folder that receives the rendered previews.
TEST_DIR = "/home/jinjinjara1022/AutonomousDriving/datasets/kitti_object/testing"
OUT_DIR = "./preview_3d_bev_testing"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ------------------------------------------------------------
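# Helper utilities used by this cell:
# - read_calib_file, crop_resize, uvz_to_xyz, project_to_image and draw_projected_box3d
#   are (re)defined below unconditionally, replacing any earlier definitions.
# - compute_box_3d_fixed (axis-corrected box) and the BEV helpers
#   (bev_canvas / world_to_bev / draw_rotated_bev_rect / ...) are defined here only
#   when an earlier cell (e.g. cells 1-16) has not already defined them.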
# ------------------------------------------------------------
def read_calib_file(path:str):
    """Parse a calibration text file of ``key: numbers`` lines into float32 matrices.

    Lines without a colon are ignored. Recognised entries:
      * keys starting with "P" holding 12 numbers        -> (3, 4)
      * "R0_rect" or "R_rect" holding 9 numbers          -> (3, 3)
      * "Tr_velo_to_cam" holding 12 numbers              -> (3, 4)
    Any other entry is dropped.

    Args:
        path: path of the calibration file.

    Returns:
        dict mapping the recognised keys to float32 numpy arrays.

    Raises:
        ValueError: if the value part of a ``key: ...`` line contains a token
            that is not a number.
    """
    calib = {}
    with open(path,'r') as handle:
        for raw in handle:
            if ":" not in raw:
                continue
            key, _, payload = raw.strip().partition(":")
            payload = payload.strip()
            # Values are parsed before the key is inspected, so malformed
            # numbers raise even for keys that would be dropped.
            values = [float(tok) for tok in payload.split()] if payload else []
            count = len(values)

            if key.startswith("P") and count == 12:
                shape = (3, 4)
            elif key in ("R0_rect", "R_rect") and count == 9:
                shape = (3, 3)
            elif key == "Tr_velo_to_cam" and count == 12:
                shape = (3, 4)
            else:
                continue
            calib[key] = np.array(values, np.float32).reshape(shape)
    return calib

# Axis-corrected 3D box (defined only if no earlier cell provided it)
try:
    compute_box_3d_fixed
except NameError:
    def compute_box_3d_fixed(dim, loc, ry):
        """Return the 8 corners of a 3D box as a (3, 8) float32 array.

        In the box frame the width lies along x, the length along z, and the
        box extends from y = 0 up to y = -h. Corners 0..3 are the y = 0 face
        and 4..7 the y = -h face. The corners are rotated about the y axis by
        ``ry`` and then translated by ``loc``.

        Args:
            dim: (h, w, l) box size.
            loc: (x, y, z) position of the y = 0 face centre.
            ry: rotation about the y axis, in radians.
        """
        h, w, l = dim
        half_w, half_l = w/2, l/2
        corners = np.array([
            [half_w,  half_w, -half_w, -half_w, half_w,  half_w, -half_w, -half_w],
            [     0,       0,       0,       0,     -h,      -h,      -h,      -h],
            [half_l, -half_l, -half_l,  half_l, half_l, -half_l, -half_l,  half_l],
        ], dtype=np.float32)
        cos_r = math.cos(ry)
        sin_r = math.sin(ry)
        rot_y = np.array([
            [ cos_r, 0, sin_r],
            [     0, 1,     0],
            [-sin_r, 0, cos_r],
        ], dtype=np.float32)
        offset = np.array(loc, np.float32).reshape(3,1)
        return rot_y @ corners + offset

# BEV utilities (defined only if no earlier cell provided bev_canvas)
try:
    bev_canvas
except NameError:
    def bev_canvas(W=600,H=600,bg=255):
        """Return a blank (H, W, 3) uint8 canvas filled with ``bg``."""
        return np.full((H, W, 3), bg, np.uint8)

    def world_to_bev(x,z,xr=(-20,20),zr=(0,60),W=600,H=600):
        """Map a ground-plane point (x, z) to integer BEV pixel coordinates (u, v).

        ``xr`` spans the canvas width left-to-right and ``zr`` spans the
        canvas height bottom-to-top.
        """
        col = (x - xr[0]) / (xr[1] - xr[0]) * (W - 1)
        row = H - 1 - (z - zr[0]) / (zr[1] - zr[0]) * (H - 1)
        return int(round(col)), int(round(row))

    def draw_rotated_bev_rect(bev, corners_xz, color, x_range=(-20,20), z_range=(0,60)):
        """Draw the closed polygon through ``corners_xz`` ((X, Z) pairs) on ``bev``."""
        canvas_h, canvas_w = bev.shape[0], bev.shape[1]
        pixel_pts = []
        for cx, cz in corners_xz:
            pixel_pts.append(world_to_bev(cx, cz, x_range, z_range, canvas_w, canvas_h))
        cv2.polylines(bev, [np.asarray(pixel_pts, dtype=np.int32)], True, color, 2)

    def get_bottom_rect_xz(dims, loc, ry):
        """Return the (X, Z) pairs of corners 0..3 of the box, or None if any corner has z <= 0."""
        box = compute_box_3d_fixed(dims, loc, ry)
        if np.any(box[2] <= 0):
            return None
        return [(bx, bz) for bx, bz in zip(box[0, :4], box[2, :4])]

    def estimate_half_fov_x(P2, img_w):
        """Approximate the horizontal half field-of-view in radians.

        Averages |(u - cx) / fx| at the left (u = 0) and right (u = img_w)
        image edges and takes the arctangent.
        """
        focal_x, center_x = P2[0,0], P2[0,2]
        tan_left = abs((0 - center_x) / focal_x)
        tan_right = abs((img_w - center_x) / focal_x)
        return math.atan(0.5 * (tan_left + tan_right))

    def draw_fov_wedge(bev, P2, img_w, x_range=(-20,20), z_range=(0,60), color=(180,180,180)):
        """Draw the two field-of-view boundary lines and a camera marker on ``bev``."""
        canvas_h, canvas_w = bev.shape[:2]
        half_fov = estimate_half_fov_x(P2, img_w)
        origin_px = world_to_bev(0, 0, x_range, z_range, canvas_w, canvas_h)
        for side in (-1, 1):
            far_px = world_to_bev(side * z_range[1] * math.tan(half_fov), z_range[1],
                                  x_range, z_range, canvas_w, canvas_h)
            cv2.line(bev, origin_px, far_px, color, 2, cv2.LINE_AA)
        # Camera position marker at the origin.
        cv2.circle(bev, origin_px, 4, (0,0,255), -1)

# ------------------------------------------------------------
# 2D Detector (COCO 프리트레인) → ROI (Ped/Cyc/Car만)
# ------------------------------------------------------------
import torchvision
# COCO-pretrained Faster R-CNN, moved to the inference device and put in eval mode.
det2d = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
det2d = det2d.to(device).eval()

# Detector label id -> KITTI class name; ids missing here are discarded.
# (In torchvision's COCO label map: 1 person, 2 bicycle, 3 car, 4 motorcycle, 6 bus, 8 truck.)
COCO_TO_KITTI = {
    1: "Pedestrian",
    2: "Cyclist",
    3: "Car",
    4: "Cyclist",
    6: "Car",
    8: "Car",
}

# Row index of each class in priors_arr; keep this order in sync with priors_arr.
class_to_idx = dict(Car=0, Pedestrian=1, Cyclist=2)

# Reuse priors_arr when an earlier cell computed it; otherwise fall back to defaults.
try:
    priors_arr
except NameError:
    priors_arr = np.array(
        [
            [1.52, 1.63, 3.88],  # Car
            [1.73, 0.60, 0.80],  # Ped
            [1.73, 0.60, 1.76],  # Cyc
        ],
        dtype=np.float32,
    )

def run_detector_rois(img_rgb, score_thresh=0.5, max_dets=30):
    """Run the 2D detector on one RGB image and return KITTI-class ROIs.

    Args:
        img_rgb: (H, W, 3) image array; it is scaled by 1/255 before being
            passed to ``det2d``. Non-contiguous views (e.g. a channel-reversed
            slice) are accepted.
        score_thresh: detections scoring below this value are dropped.
        max_dets: maximum number of ROIs returned; 0 or less yields an empty list.

    Returns:
        A list of dicts ``{"bbox": [l, t, r, b], "kcls": str, "score": float}``
        in the detector's output order. Box coordinates are truncated to int.
        Detections whose label has no entry in ``COCO_TO_KITTI``/``class_to_idx``
        or whose box is narrower than 12 px or shorter than 16 px are skipped.
    """
    # torch.from_numpy rejects arrays with negative strides, so normalise the layout first.
    x = torch.from_numpy(np.ascontiguousarray(img_rgb)).permute(2,0,1).float()/255.0
    # Pure inference: never record an autograd graph, whatever the caller's grad mode is.
    with torch.no_grad():
        out = det2d([x.to(device)])[0]
    boxes = out["boxes"].detach().cpu().numpy()
    labels= out["labels"].detach().cpu().numpy()
    scores= out["scores"].detach().cpu().numpy()

    keep=[]
    for b,l,s in zip(boxes, labels, scores):
        # Checked before appending so that max_dets <= 0 returns nothing.
        if len(keep) >= max_dets: break
        if s < score_thresh: continue
        kcls = COCO_TO_KITTI.get(int(l))
        if kcls not in class_to_idx: continue
        l_,t_,r_,b_ = b.astype(int).tolist()
        # Drop boxes too small to give a usable crop.
        if r_-l_ < 12 or b_-t_ < 16: continue
        keep.append({"bbox":[l_,t_,r_,b_], "kcls":kcls, "score":float(s)})
    return keep

# ------------------------------------------------------------
# testing 한 장을 "val과 동일 스타일"로 그리기 (GT 없음)
# ------------------------------------------------------------
# Per-channel mean / std used to normalise ROI crops (after scaling to [0, 1]).
IMAGENET_MEAN = np.asarray((0.485, 0.456, 0.406), dtype=np.float32)
IMAGENET_STD = np.asarray((0.229, 0.224, 0.225), dtype=np.float32)
# Side length (pixels) of the square crop given to the 3D head.
IMG_SIZE = 128
# Enlargement factor applied to the longer bbox side when cropping.
PADDING_SCALE = 1.2

def crop_resize(img, bbox, out_size=IMG_SIZE, scale=PADDING_SCALE):
    """Cut a square region around a 2D box and resize it to ``out_size`` x ``out_size``.

    The square is centred on the box and its side is the longer box side
    multiplied by ``scale``. Where the square extends past the image, the
    image is first padded with black pixels.

    Args:
        img: source image array (height and width are its first two axes).
        bbox: (left, top, right, bottom) box in pixel coordinates.
        out_size: side length of the returned square crop.
        scale: enlargement factor applied to the longer box side.

    Returns:
        The resized crop (bilinear interpolation).
    """
    img_h, img_w = img.shape[:2]
    left, top, right, bottom = bbox
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    side = max(right - left, bottom - top) * scale

    x1 = int(center_x - side/2)
    y1 = int(center_y - side/2)
    x2 = int(center_x + side/2)
    y2 = int(center_y + side/2)

    # Amount by which the square sticks out on each side of the image.
    pad_left = max(0, -x1)
    pad_top = max(0, -y1)
    pad_right = max(0, x2 - img_w)
    pad_bottom = max(0, y2 - img_h)
    if pad_left or pad_top or pad_right or pad_bottom:
        img = cv2.copyMakeBorder(img, pad_top, pad_bottom, pad_left, pad_right,
                                 cv2.BORDER_CONSTANT, value=(0,0,0))
        # Shift the window into the padded image's coordinate frame.
        x1 += pad_left
        x2 += pad_left
        y1 += pad_top
        y2 += pad_top

    window = img[y1:y2, x1:x2]
    return cv2.resize(window, (out_size, out_size), interpolation=cv2.INTER_LINEAR)

def uvz_to_xyz(uv, Z, P2):
    """Back-project a pixel (u, v) at depth Z into 3D using the intrinsics in ``P2``.

    Only fx, fy, cx and cy are read (from P2[0,0], P2[1,1], P2[0,2], P2[1,2]);
    the fourth column of ``P2`` is not used.

    Args:
        uv: (u, v) pixel coordinates.
        Z: depth of the point.
        P2: projection matrix.

    Returns:
        float32 array [X, Y, Z].
    """
    u, v = uv
    focal_x, focal_y = P2[0,0], P2[1,1]
    center_x, center_y = P2[0,2], P2[1,2]
    x_cam = (u - center_x) * Z / focal_x
    y_cam = (v - center_y) * Z / focal_y
    return np.array([x_cam, y_cam, Z], dtype=np.float32)

def project_to_image(pts_3d, P):
    """Project 3D points to pixel coordinates with projection matrix ``P``.

    Args:
        pts_3d: (3, N) array of points.
        P: matrix applied to the homogeneous (4, N) points.

    Returns:
        (2, N) array of pixel coordinates. The homogeneous depth is clipped
        to at least 1e-6 before the division, so points at or behind the
        camera plane do not divide by zero or flip sign.
    """
    num_pts = pts_3d.shape[1]
    ones_row = np.ones((1, num_pts), dtype=np.float32)
    projected = P @ np.vstack([pts_3d, ones_row])
    depth = np.clip(projected[2:], 1e-6, None)
    return projected[:2] / depth

def draw_projected_box3d(img, qs, color=(0,165,255), thickness=2):
    """Draw the 12 edges of a projected 3D box on ``img``.

    Args:
        img: image to draw on (modified in place by ``cv2.line``).
        qs: (2, 8) array of projected corner coordinates; values are
            truncated to int.
        color: line colour.
        thickness: line thickness in pixels.

    Returns:
        The same ``img`` object.
    """
    corners = qs.T.astype(int)
    # Corners 0..3 and 4..7 each form a closed ring; corner k is joined to k+4.
    ring_a = [(k, (k + 1) % 4) for k in range(4)]
    ring_b = [(k + 4, (k + 1) % 4 + 4) for k in range(4)]
    links = [(k, k + 4) for k in range(4)]
    for start, end in ring_a + ring_b + links:
        cv2.line(img, tuple(corners[start]), tuple(corners[end]), color, thickness)
    return img

@torch.no_grad()
def visualize_test_like_val(model, sid,
                            score_thresh=0.5, max_dets=30,
                            x_range=(-20,20), z_range=(0,60),
                            out_dir=OUT_DIR):
    """Render predicted 3D boxes for one testing-split frame next to a BEV plot.

    There is no ground truth here: ROIs come from the 2D detector
    (``run_detector_rois``), each ROI crop is fed to ``model``, and the
    resulting 3D box is drawn in orange on the camera image and on a 600x600
    bird's-eye-view canvas. The detector's 2D boxes are drawn in blue. The two
    views are concatenated side by side and written as a JPEG.

    Args:
        model: network called as ``model(x)`` and returning
            ``(dims_res, logz, yaw)``; it is switched to eval mode.
        sid: frame id string (file stem under ``TEST_DIR``).
        score_thresh, max_dets: forwarded to ``run_detector_rois``.
        x_range, z_range: BEV extents forwarded to the drawing helpers.
        out_dir: output directory, created if missing.

    Returns:
        Path of the written image as str.

    Raises:
        AssertionError: if the image or calibration file does not exist.
        FileNotFoundError: if the image file exists but cannot be decoded.
    """
    # Same as visualize_frame_with_bev: inference must not run in training mode.
    model.eval()
    out_dir = Path(out_dir); out_dir.mkdir(parents=True, exist_ok=True)
    img_path  = Path(TEST_DIR)/"image_2"/f"{sid}.png"
    calib_path= Path(TEST_DIR)/"calib"/f"{sid}.txt"
    assert img_path.exists() and calib_path.exists(), f"missing files for {sid}"

    img_bgr = cv2.imread(str(img_path))
    if img_bgr is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read image: {img_path}")
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    W = img_bgr.shape[1]
    P2  = read_calib_file(str(calib_path))["P2"]

    # 1) 2D detector -> ROIs (drawn in blue below)
    rois = run_detector_rois(img_rgb, score_thresh=score_thresh, max_dets=max_dets)

    # 2) BEV canvas with the field-of-view wedge (same as the validation preview)
    bev = bev_canvas(600,600,255); draw_fov_wedge(bev, P2, W, x_range, z_range)

    # 3) Feed each ROI to the 3D head and draw the prediction on image and BEV
    for r in rois:
        l,t,rgt,b = r["bbox"]
        kcls = r["kcls"]
        prior = priors_arr[class_to_idx[kcls]]

        # Blue: the detector's 2D box
        cv2.rectangle(img_bgr,(l,t),(rgt,b),(255,0,0),2)

        # ROI preprocessing: square crop, scale to [0, 1], normalise, HWC -> NCHW
        crop = crop_resize(img_rgb, [l,t,rgt,b], out_size=IMG_SIZE, scale=PADDING_SCALE)
        x = torch.from_numpy(((crop/255.0 - IMAGENET_MEAN)/IMAGENET_STD)).permute(2,0,1).unsqueeze(0).float().to(device)

        # Model inference (dims_res, logZ, yaw(sin,cos))
        dims_res, logz, yaw = model(x)
        dims_pred = (dims_res.squeeze(0).cpu().numpy() + prior)      # [h,w,l]
        Z = float(torch.exp(logz.squeeze()).cpu().numpy())
        yv = yaw.squeeze(0).cpu().numpy()
        ry = float(atan2(yv[0], yv[1]))

        # uv: bottom-centre of the 2D box
        uv = ((l+rgt)/2.0, b)
        loc = uvz_to_xyz(uv, Z, P2)

        # 3D box (orange) on the image and its footprint on the BEV
        C = compute_box_3d_fixed(dims_pred, loc, ry)
        if (C[2] > 0).all():
            pts = project_to_image(C, P2)
            img_bgr = draw_projected_box3d(img_bgr, pts, (0,165,255), 2)
            # Take the footprint (corners 0..3) from the very box drawn on the
            # image. get_bottom_rect_xz may be bound to an earlier cell's
            # version built on a different box function, which would make the
            # BEV disagree with the image; reusing C also avoids a recompute.
            rect = list(zip(C[0, :4], C[2, :4]))
            draw_rotated_bev_rect(bev, rect, (0,165,255), x_range, z_range)

    # 4) Concatenate image and BEV side by side (BEV resized to the image height)
    bev_resized = cv2.resize(bev, (img_bgr.shape[0], img_bgr.shape[0]))
    combo = cv2.hconcat([img_bgr, bev_resized])
    out_path = Path(out_dir) / f"{sid}_testing_combo.jpg"
    cv2.imwrite(str(out_path), combo)
    return str(out_path)

# Example run: list every testing frame id, then render the one at index 3.
test_ids = sorted(
    png_path.stem
    for png_path in (Path(TEST_DIR)/"image_2").glob("*.png")
)
print("testing frames:", len(test_ids))
p = visualize_test_like_val(
    model,
    test_ids[3],
    score_thresh=0.55,
    max_dets=25,
)
print("saved:", p)


testing frames: 7518
saved: preview_3d_bev_testing/000003_testing_combo.jpg
