In [17]:
import torch
from torch.utils.data import Dataset
import os
import cv2
import json
from PIL import Image

class BDDDetectionDataset(Dataset):
    def __init__(self, image_dir, annotation_file, category_map=None, transforms=None, evaluation=False):
        self.image_dir = image_dir
        self.transforms = transforms
        self.evaluation = evaluation

        with open(annotation_file) as f:
            self.annotations = json.load(f)

        self.category_map = category_map or self._generate_category_map()
        self.image_annotations = self._organize_annotations()

    def _generate_category_map(self):
        # Extract all categories from the annotation file and assign integer labels
        categories = set()
        for item in self.annotations:
            for label in item.get('labels', []):
                categories.add(label['category'])
        categories = sorted(list(categories))
        categories.remove("lane")
        categories.remove("drivable area")
        return {cat: idx + 1 for idx, cat in enumerate(categories)}  # +1 because 0 is background

    def _organize_annotations(self):
        image_annots = []
        for item in self.annotations:
            filename = item['name']
            labels = item.get('labels', [])

            boxes = []
            labels_idx = []

            for label in labels:
                if 'box2d' not in label:
                    continue
                box = label['box2d']
                x1, y1, x2, y2 = box['x1'], box['y1'], box['x2'], box['y2']
                
                if x2 <= x1 or y2 <= y1:
                    continue
                boxes.append([x1, y1, x2, y2])
                labels_idx.append(self.category_map[label['category']])
            
            # Skip image with no valid boxes
            if len(boxes) == 0:
                continue  
                
            image_annots.append({
                'filename': filename,
                'boxes': boxes,
                'labels': labels_idx
            })

        return image_annots

    def __getitem__(self, idx):
        data = self.image_annotations[idx]
        img_path = os.path.join(self.image_dir, data['filename'])



        img = Image.open(img_path).convert("RGB")

        boxes = torch.tensor(data['boxes'], dtype=torch.float32)
        labels = torch.tensor(data['labels'], dtype=torch.int64)
        target = {
            'boxes': boxes,
            'labels': labels,
        }
        if self.evaluation:
            target['image_id'] = data['filename']

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.image_annotations)

    def get_category_map(self):
        return self.category_map

In [19]:
from torchvision import transforms
from torch.utils.data import DataLoader

transform_list = transforms.Compose([
    transforms.ToTensor()
])

train_image_dir = "/home/kkp3kor/2025/bdk_object_detection/data/bdd100k_images_100k/bdd100k/images/100k/train"
train_label_json = "/home/kkp3kor/2025/bdk_object_detection/data/bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_train.json"
train_dataset = BDDDetectionDataset(
    train_image_dir, 
    train_label_json, 
    category_map=None, 
    transforms=transform_list,
    evaluation=False)
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

In [20]:
train_dataset.category_map

{'bike': 1,
 'bus': 2,
 'car': 3,
 'motor': 4,
 'person': 5,
 'rider': 6,
 'traffic light': 7,
 'traffic sign': 8,
 'train': 9,
 'truck': 10}

In [21]:
valid_image_dir = "/home/kkp3kor/2025/bdk_object_detection/data/bdd100k_images_100k/bdd100k/images/100k/val"
valid_label_json = "/home/kkp3kor/2025/bdk_object_detection/data/bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_val.json"
valid_dataset = BDDDetectionDataset(
    valid_image_dir, 
    valid_label_json, 
    category_map=train_dataset.category_map, 
    transforms=transform_list,
    evaluation=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

In [22]:
num_classes = len(train_dataset.category_map) + 1  # +1 for background class
num_classes

11

In [23]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torch

model = fasterrcnn_resnet50_fpn(pretrained="/home/kkp3kor/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth")
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

In [25]:
device

device(type='cuda')

In [26]:
import torch
import os

def train_model(model, train_loader, val_loader, optimizer, device, num_epochs=100, save_path="model/best_model.pth"):
    best_val_loss = float('inf')
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for images, targets in train_loader:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            train_loss += losses.item()

        avg_train_loss = train_loss / len(train_loader)

        # ------------------ Validation ------------------
        val_loss = 0.0
        with torch.no_grad():
            for images, targets in val_loader:
                images = [img.to(device) for img in images]
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())
                val_loss += losses.item()

        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            torch.save(model.state_dict(), save_path)
            print(f"✅ Saved best model at epoch {epoch+1} with val loss {best_val_loss:.4f}")

In [None]:
train_model(model, train_data_loader, valid_data_loader, optimizer, device, num_epochs=100, save_path="model/best_model.pth")

Epoch [1/100] - Train Loss: 0.8761 | Val Loss: 0.8395
✅ Saved best model at epoch 1 with val loss 0.8395
Epoch [2/100] - Train Loss: 0.8141 | Val Loss: 0.8161
✅ Saved best model at epoch 2 with val loss 0.8161
Epoch [3/100] - Train Loss: 0.7982 | Val Loss: 0.8084
✅ Saved best model at epoch 3 with val loss 0.8084
Epoch [4/100] - Train Loss: 0.7887 | Val Loss: 0.8097
Epoch [5/100] - Train Loss: 0.7827 | Val Loss: 0.8093
Epoch [6/100] - Train Loss: 0.7786 | Val Loss: 0.8119


In [27]:
model.load_state_dict(torch.load("model/best_model.pth"))
import os
from tqdm import tqdm

def run_inference(model, dataloader, device, score_thresh=0.4, save_dir="outputs/vis"):
    model.to(device)
    model.eval()
    os.makedirs(save_dir, exist_ok=True)
    predictions = []
    i = 0
    for images, targets in tqdm(dataloader):
        images = [img.to(device) for img in images]
        with torch.no_grad():
            outputs = model(images)

        for image, output, target in zip(images, outputs, targets):
            keep = output["scores"] > score_thresh
            pred_boxes = output["boxes"][keep].cpu()
            pred_labels = output["labels"][keep].cpu()
            pred_scores = output["scores"][keep].cpu()

            predictions.append({
                "filename": target["image_id"],
                "pred_boxes": pred_boxes,
                "pred_scores": pred_scores,
                "gt_boxes": target["boxes"],
                "pred_labels": pred_labels,
                "gt_labels": target["labels"]
            })
        if i == 400:
            break
        i += 1
    return predictions

In [28]:
predictions = run_inference(model, valid_data_loader, device)

 64%|██████▍   | 400/625 [02:17<01:17,  2.90it/s]


In [29]:
from torchvision.ops import box_iou

def compute_eval_metrics(predictions, iou_thresh=0.5):
    TP, FP, FN = 0, 0, 0
    for pred in predictions:
        preds = pred["pred_boxes"]
        gts = pred["gt_boxes"]
        if len(preds) == 0 and len(gts) == 0:
            continue
        elif len(preds) == 0:
            FN += len(gts)
            continue
        elif len(gts) == 0:
            FP += len(preds)
            continue
        ious = box_iou(preds, gts)
        matched = (ious.max(dim=1).values > iou_thresh).sum().item()
        TP += matched
        FP += len(preds) - matched
        FN += len(gts) - matched

    precision = TP / (TP + FP + 1e-6)
    recall = TP / (TP + FN + 1e-6)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-6)
    return precision, recall, f1


import json
import os

def save_predictions_to_json(predictions, save_path="outputs/predictions.json"):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    def convert(obj):
        if isinstance(obj, torch.Tensor):
            return obj.tolist()
        elif isinstance(obj, dict):
            return {k: convert(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [convert(v) for v in obj]
        else:
            return obj

    clean_preds = convert(predictions)

    with open(save_path, "w") as f:
        json.dump(clean_preds, f, indent=2)

import json
import torch

def load_predictions_from_json(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)

    for pred in data:
        # Convert lists back to tensors
        pred["pred_boxes"] = torch.tensor(pred["pred_boxes"], dtype=torch.float32)
        pred["pred_labels"] = torch.tensor(pred["pred_labels"], dtype=torch.int64)
        pred["gt_boxes"] = torch.tensor(pred["gt_boxes"], dtype=torch.float32)
        pred["gt_labels"] = torch.tensor(pred["gt_labels"], dtype=torch.int64)
        pred["pred_scores"] = torch.tensor(pred["pred_scores"], dtype=torch.int64)

    return data

In [30]:
#predictions = run_inference(model, valid_data_loader, device)
precision, recall, f1 = compute_eval_metrics(predictions)
print(f"Precision: {precision:.3f} | Recall: {recall:.3f} | F1 Score: {f1:.3f}")
save_predictions_to_json(predictions, "outputs/predictions.json")
pred_data = load_predictions_from_json("outputs/predictions.json")
precision, recall, f1 = compute_eval_metrics(pred_data)
print(f"Precision: {precision:.3f} | Recall: {recall:.3f} | F1 Score: {f1:.3f}")

Precision: 0.634 | Recall: 0.797 | F1 Score: 0.706
Precision: 0.634 | Recall: 0.797 | F1 Score: 0.706


In [31]:
from torchvision.utils import draw_bounding_boxes
import torchvision.transforms.functional as F
import matplotlib.pyplot as plt

def visualize_sample(image_tensor, gt_boxes, pred_boxes, pred_labels, save_path):
    img = (image_tensor * 255).byte().clone()
    img = draw_bounding_boxes(img, gt_boxes, colors="green", labels=["GT"] * len(gt_boxes))
    img = draw_bounding_boxes(img, pred_boxes, colors="red", labels=["PRED"] * len(pred_boxes))
    img = F.to_pil_image(img)
    img.save(save_path)
    
i = 0
for pred in predictions:
    img_path = os.path.join(valid_image_dir, pred["filename"])
    if os.path.exists(img_path):
        image = Image.open(img_path).convert("RGB")
        image_tensor = F.to_tensor(image)
        visualize_sample(image_tensor, pred["gt_boxes"], pred["pred_boxes"], pred["pred_labels"],
                         f"outputs/vis/{pred['filename']}")
    if i == 10:
        break
    i += 1

In [32]:
from sklearn.metrics import average_precision_score
import torch
import numpy as np
from collections import defaultdict
from torchvision.ops import box_iou

def compute_classwise_ap(predictions, num_classes, iou_thresh=0.5):
    """
    Args:
        predictions: list of dicts with keys: pred_boxes, pred_labels, pred_scores, gt_boxes, gt_labels
        num_classes: total number of classes (excluding background=0)
        iou_thresh: IoU threshold for positive match
    Returns:
        ap_per_class: dict with class_id -> AP
        mAP: mean AP across classes
    """

    # Collect all predictions and ground truth info
    pred_by_class = defaultdict(list)
    gt_by_class = defaultdict(list)

    for pred in predictions:
        pred_boxes = pred["pred_boxes"]
        pred_labels = pred["pred_labels"]
        pred_scores = pred["pred_scores"]
        gt_boxes = pred["gt_boxes"]
        gt_labels = pred["gt_labels"]

        for c in range(1, num_classes + 1):
            # Filter for current class
            cls_pred_mask = pred_labels == c
            cls_gt_mask = gt_labels == c

            cls_pred_boxes = pred_boxes[cls_pred_mask]
            cls_pred_scores = pred_scores[cls_pred_mask]
            cls_gt_boxes = gt_boxes[cls_gt_mask]

            matched = torch.zeros(len(cls_gt_boxes))  # track matched GT boxes
            tp = torch.zeros(len(cls_pred_boxes))
            fp = torch.zeros(len(cls_pred_boxes))

            for i, pbox in enumerate(cls_pred_boxes):
                if len(cls_gt_boxes) == 0:
                    fp[i] = 1
                    continue
                ious = box_iou(pbox.unsqueeze(0), cls_gt_boxes)[0]
                max_iou, max_idx = ious.max(0)

                if max_iou >= iou_thresh and matched[max_idx] == 0:
                    tp[i] = 1
                    matched[max_idx] = 1
                else:
                    fp[i] = 1

            # Store per-class results
            pred_by_class[c].extend(zip(cls_pred_scores.tolist(), tp.tolist(), fp.tolist()))
            gt_by_class[c].append(len(cls_gt_boxes))

    # Compute precision-recall curve & AP for each class
    ap_per_class = {}
    for c in range(1, num_classes + 1):
        if len(pred_by_class[c]) == 0:
            ap_per_class[c] = 0.0
            continue

        pred_by_class[c].sort(key=lambda x: -x[0])  # sort by score descending
        scores, tps, fps = zip(*pred_by_class[c])

        scores = np.array(scores)
        tps = np.array(tps)
        fps = np.array(fps)

        tps_cum = np.cumsum(tps)
        fps_cum = np.cumsum(fps)
        recalls = tps_cum / (sum(gt_by_class[c]) + 1e-6)
        precisions = tps_cum / (tps_cum + fps_cum + 1e-6)

        # Interpolate precision
        ap = 0.0
        for r in np.linspace(0, 1, 11):
            p = precisions[recalls >= r].max() if np.any(recalls >= r) else 0
            ap += p / 11.0
        ap_per_class[c] = round(ap, 4)

    mAP = round(np.mean(list(ap_per_class.values())), 4)
    return ap_per_class, mAP

In [33]:
num_classes = len(train_dataset.category_map)  # e.g., 10
ap_per_class, mean_ap = compute_classwise_ap(predictions, num_classes, iou_thresh=0.4)

print(f"\n✅ mAP@[IoU=0.4]: {mean_ap:.4f}")
print("📊 Class-wise AP:")
for class_id, ap in ap_per_class.items():
    class_name = [k for k, v in train_dataset.category_map.items() if v == class_id][0]
    print(f"{class_name:20}: {ap:.4f}")


✅ mAP@[IoU=0.4]: 0.4770
📊 Class-wise AP:
bike                : 0.3993
bus                 : 0.4823
car                 : 0.7835
motor               : 0.3261
person              : 0.5886
rider               : 0.3483
traffic light       : 0.6685
traffic sign        : 0.6375
train               : 0.0000
truck               : 0.5357


In [34]:
ap_per_class, mean_ap = compute_classwise_ap(predictions, num_classes, iou_thresh=0.5)

print(f"\n✅ mAP@[IoU=0.5]: {mean_ap:.4f}")
print("📊 Class-wise AP:")
for class_id, ap in ap_per_class.items():
    class_name = [k for k, v in train_dataset.category_map.items() if v == class_id][0]
    print(f"{class_name:20}: {ap:.4f}")


✅ mAP@[IoU=0.5]: 0.4395
📊 Class-wise AP:
bike                : 0.3797
bus                 : 0.4807
car                 : 0.7011
motor               : 0.3217
person              : 0.5103
rider               : 0.3420
traffic light       : 0.5576
traffic sign        : 0.6226
train               : 0.0000
truck               : 0.4797


In [35]:
ap_per_class, mean_ap = compute_classwise_ap(predictions, num_classes, iou_thresh=0.6)

print(f"\n✅ mAP@[IoU=0.5]: {mean_ap:.4f}")
print("📊 Class-wise AP:")
for class_id, ap in ap_per_class.items():
    class_name = [k for k, v in train_dataset.category_map.items() if v == class_id][0]
    print(f"{class_name:20}: {ap:.4f}")


✅ mAP@[IoU=0.5]: 0.3861
📊 Class-wise AP:
bike                : 0.2900
bus                 : 0.4221
car                 : 0.6673
motor               : 0.2440
person              : 0.4793
rider               : 0.3331
traffic light       : 0.4101
traffic sign        : 0.5445
train               : 0.0000
truck               : 0.4702
