# Project 2 - Task 2.2 Notebook


In [53]:
import cv2
import sns
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import warnings
import torch
from ultralytics import YOLO
from transformers import pipeline
from torchvision.datasets import VOCDetection
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from transformers import (
    AutoImageProcessor,
    AutoModelForObjectDetection,
    RTDetrForObjectDetection, RTDetrImageProcessor,
    DetrImageProcessor,
    DetrForObjectDetection
)
import pandas as pd
import seaborn as sns

In [54]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy's global RNG and PyTorch
    (the default generator, plus every CUDA device when CUDA is available).

    Args:
        seed: Integer seed shared by all generators.
    """
    # Local import: ``random`` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # manual_seed_all seeds every visible GPU, including the current one,
        # so a separate torch.cuda.manual_seed call is not needed.
        torch.cuda.manual_seed_all(seed)


# Seed all RNGs once, before any model or data code runs.
set_seed(42)
# cuDNN reproducibility flags (see the PyTorch reproducibility notes):
# request deterministic algorithms and turn off benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [55]:
# Choose the compute backend in order of preference: first CUDA GPU,
# then Apple MPS, and finally the CPU as a fallback.
device = torch.device(
    "cuda:0" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print("Using device", device)

Using device cuda:0


In [56]:
# Class names in ID order: a name's position in this list is its class ID.
categories = [
    'Plastic bag & wrapper',
    'Cigarette',
    'Bottle',
    'Bottle cap',
    'Can',
    'Carton',
]

# Forward (name -> ID) and reverse (ID -> name) lookups derived from the list.
label2id = {name: class_id for class_id, name in enumerate(categories)}
id2label = dict(enumerate(categories))

# Side length used for image resizing and the RT-DETR checkpoint identifier.
image_size = 480
checkpoint_rtdetr = "PekingU/rtdetr_r50vd"

In [57]:
from transformers import AutoImageProcessor

# Build the image processor for the RT-DETR checkpoint. Resizing is enabled
# with a square image_size x image_size target, and the fast processor
# implementation is requested.
image_processor = AutoImageProcessor.from_pretrained(
    checkpoint_rtdetr,
    do_resize=True,
    size=dict(width=image_size, height=image_size),
    use_fast=True,
)

In [58]:
from taco_dataset import TACODETRDetectionDataset

# Root of the TACO data: passed as the image folder and also the directory
# that holds the per-split annotation files. Change it here, in one place.
TACO_ROOT = "/home/jb/Desktop/Projects/CV/cv-project2/taco"


def _load_taco_split(split):
    """Build the dataset for one split, reading ``annotations_<split>.json``.

    Args:
        split: Split suffix of the annotation file ("train", "val" or "test").

    Returns:
        A ``TACODETRDetectionDataset`` using the shared ``image_processor``.
    """
    return TACODETRDetectionDataset(
        img_folder=TACO_ROOT,
        ann_file=f"{TACO_ROOT}/annotations_{split}.json",
        processor=image_processor,
    )


# Same construction order as before: train, validation, test.
train_dataset = _load_taco_split("train")
validation_dataset = _load_taco_split("val")
test_dataset = _load_taco_split("test")

loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [59]:
# Build class-agnostic ground-truth targets for every test image.
targets_list = []
coco_ds = test_dataset.coco_dataset

print(f"Processing targets for {len(coco_ds)} images...")

for i in range(len(coco_ds)):
    # Only the annotations are needed here; the image itself is not used.
    _img, target_raw = coco_ds[i]

    # Annotation boxes are [x, y, w, h]; convert to corners [x1, y1, x2, y2].
    boxes = [
        [b[0], b[1], b[0] + b[2], b[1] + b[3]]
        for b in (obj['bbox'] for obj in target_raw)
    ]

    # Agnostic label: every object is mapped to class 0.
    labels = [0] * len(boxes)

    targets_list.append({
        # reshape keeps the (N, 4) layout even when an image has no
        # annotations; torch.tensor([]) alone would have shape (0,).
        "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4).to(device),
        "labels": torch.tensor(labels, dtype=torch.int64).to(device)
    })

Processing targets for 178 images...


In [60]:
def apply_topk_strategy(boxes, scores, k, device):
    """Keep the ``k`` highest-scoring detections and give them all label 0.

    Args:
        boxes: Tensor of boxes, indexed along dim 0 in step with ``scores``.
        scores: 1-D tensor of confidence scores.
        k: Maximum number of detections to keep. Values <= 0 keep nothing.
        device: Device for tensors created here (the labels and the empty
            outputs); the selected boxes and scores stay on their own device.

    Returns:
        Dict with "boxes", "scores" and "labels". Detections are ordered by
        descending score and at most ``k`` are returned. When nothing is
        kept, the tensors are empty but keep the usual layout and dtypes:
        boxes (0, 4) float32, scores (0,) float32, labels (0,) int64.
    """
    if k <= 0 or len(scores) == 0:
        return {
            "boxes": torch.zeros((0, 4), dtype=torch.float32, device=device),
            "scores": torch.zeros(0, dtype=torch.float32, device=device),
            "labels": torch.zeros(0, dtype=torch.int64, device=device),
        }

    # Sort by confidence; slicing past the end is safe, so no min() is needed.
    sort_ind = torch.argsort(scores, descending=True)
    top_ind = sort_ind[:k]

    return {
        "boxes": boxes[top_ind],
        "scores": scores[top_ind],
        # Class-agnostic evaluation: every kept detection is class 0.
        "labels": torch.zeros(top_ind.numel(), dtype=torch.int64, device=device),
    }

def convert_yolo_topk(yolo_results, k, device):
    """Run the shared top-k selection on one YOLO result object.

    Reads the ``xyxy`` boxes and ``conf`` scores from ``yolo_results.boxes``
    and forwards them, with ``k`` and ``device``, to ``apply_topk_strategy``.
    """
    detections = yolo_results.boxes
    return apply_topk_strategy(detections.xyxy, detections.conf, k, device)

def convert_transformer_topk(results, k, device):
    """Run the shared top-k selection on one post-processed detection dict.

    ``results`` must provide "boxes" and "scores" entries; both are passed,
    with ``k`` and ``device``, to ``apply_topk_strategy``.
    """
    return apply_topk_strategy(results["boxes"], results["scores"], k, device)


In [61]:
from tqdm import tqdm


def evaluate_yolo_topk(model, dataset_wrapper, targets_list):
    """Evaluate a YOLO model with class-agnostic, top-k-limited predictions.

    For each image, the number of predictions kept equals the number of
    ground-truth labels in the matching ``targets_list`` entry.

    Args:
        model: Object whose ``predict(...)`` returns a list of results.
        dataset_wrapper: Indexable dataset yielding ``(image, annotations)``.
        targets_list: Per-image dicts of tensors, aligned with
            ``dataset_wrapper`` by index; each has a "labels" entry.

    Returns:
        The value of ``MeanAveragePrecision.compute()`` over all images.

    Note:
        Uses the notebook-level ``device`` variable.
    """
    metric = MeanAveragePrecision(box_format="xyxy", iou_type="bbox")

    print("Evaluating YOLO...")
    for idx, gt in enumerate(tqdm(targets_list)):
        image, _annotations = dataset_wrapper[idx]
        num_gt = len(gt["labels"])

        # Very low confidence threshold so top-k has enough candidates.
        first_result = model.predict(image, verbose=False, conf=0.001, device=device)[0]
        preds = convert_yolo_topk(first_result, num_gt, device)

        # The metric is updated with CPU copies of predictions and targets.
        metric.update(
            [{name: tensor.cpu() for name, tensor in preds.items()}],
            [{name: tensor.cpu() for name, tensor in gt.items()}],
        )

    return metric.compute()

def evaluate_transformer_topk(model, processor, dataset_wrapper, targets_list):
    """Evaluate a transformer detector with class-agnostic top-k predictions.

    For each image, the number of predictions kept equals the number of
    ground-truth labels in the matching ``targets_list`` entry.

    Args:
        model: Detection model called as ``model(**inputs)``.
        processor: Image processor used for preprocessing and for
            ``post_process_object_detection``.
        dataset_wrapper: Indexable dataset yielding ``(image, annotations)``;
            the image must expose ``.size`` as ``(width, height)``.
        targets_list: Per-image dicts of tensors, aligned with
            ``dataset_wrapper`` by index; each has a "labels" entry.

    Returns:
        The value of ``MeanAveragePrecision.compute()`` over all images.

    Note:
        Uses the notebook-level ``device`` variable.
    """
    metric = MeanAveragePrecision(box_format="xyxy", iou_type="bbox")

    print(f"Evaluating Transformer ({type(model).__name__})...")
    for idx, gt in enumerate(tqdm(targets_list)):
        image, _annotations = dataset_wrapper[idx]
        num_gt = len(gt["labels"])

        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-processing expects (height, width); image.size is reversed.
        target_sizes = torch.tensor([image.size[::-1]]).to(device)
        detections = processor.post_process_object_detection(
            outputs, target_sizes=target_sizes, threshold=0.001)[0]

        preds = convert_transformer_topk(detections, num_gt, device)

        # The metric is updated with CPU copies of predictions and targets.
        metric.update(
            [{name: tensor.cpu() for name, tensor in preds.items()}],
            [{name: tensor.cpu() for name, tensor in gt.items()}],
        )

    return metric.compute()


In [62]:
def print_metrics(results, confidence):
    """Print the mAP / mAR summary for each threshold and each model.

    Args:
        results: Nested mapping ``results[model_name][conf]`` -> metrics
            mapping, with model names 'yolo', 'detr' and 'rt-detr'.
        confidence: Iterable of confidence thresholds to report.
    """
    # (line prefix, metrics key) pairs, in print order. Prefixes carry the
    # exact spacing of the report.
    report_rows = (
        ("  mAP :  ", "map"),
        ("  mAP@50:    ", "map_50"),
        ("  mAP@75 :    ", "map_75"),
        ("  mAP (small objects):  ", "map_small"),
        ("  mAP (medium objects): ", "map_medium"),
        ("  mAP (large objects):  ", "map_large"),
        ("  mAR@1:                ", "mar_1"),
        ("  mAR@10:               ", "mar_10"),
        ("  mAR@100:              ", "mar_100"),
        ("  mAR (small objects):  ", "mar_small"),
        ("  mAR (medium objects): ", "mar_medium"),
        ("  mAR (large objects):  ", "mar_large"),
    )

    for conf in confidence:
        print(f"Confidence Threshold: {conf} \n")

        for model_name in ('yolo', 'detr', 'rt-detr'):
            result = results[model_name][conf]

            print(f"\n{model_name.upper()} Metrics:")
            for prefix, key in report_rows:
                print(f"{prefix}{result[key]:.4f}")

In [66]:
# (model family, checkpoint) pairs to evaluate. The family selects the loader
# and evaluation routine used in the loop below.
experiments = [
    ('yolo', 'yolov8n.pt'),  # Nano
    ('yolo', 'yolov8s.pt'),  # Small
    ('yolo', 'yolov8m.pt'),  # Medium
    ('yolo', 'yolov8l.pt'),  # Large
    ('yolo', 'yolov8x.pt'),  # Extra Large
    ('yolo', 'yolo11n.pt'),
    ('yolo', 'yolo11s.pt'),
    ('yolo', 'yolo11m.pt'),
    ('yolo', 'yolo11l.pt'),
    ('yolo', 'yolo11x.pt'),
    ('detr', 'facebook/detr-resnet-50'),
    ('detr', 'facebook/detr-resnet-101'),
    ('rtdetr', 'PekingU/rtdetr_r50vd'),
    ('rtdetr', 'PekingU/rtdetr_r101vd'),
]

# One summary row (dict) per evaluated checkpoint.
results_log = []

coco_ds = test_dataset.coco_dataset

for m_type, m_name in experiments:

    if m_type == 'yolo':
        model = YOLO(m_name).to(device)
        res = evaluate_yolo_topk(model, coco_ds, targets_list)

    elif m_type == 'detr':
        proc = DetrImageProcessor.from_pretrained(m_name)
        model = DetrForObjectDetection.from_pretrained(m_name).to(device)
        res = evaluate_transformer_topk(model, proc, coco_ds, targets_list)

    elif m_type == 'rtdetr':
        proc = RTDetrImageProcessor.from_pretrained(m_name)
        model = RTDetrForObjectDetection.from_pretrained(m_name).to(device)
        res = evaluate_transformer_topk(model, proc, coco_ds, targets_list)

    else:
        # Without this branch an unknown family would silently log the
        # previous iteration's `res` under the wrong model name.
        raise ValueError(f"Unknown model type {m_type!r} for checkpoint {m_name!r}")

    results_log.append({
        "Model": m_name,
        "mAP": res['map'].item(),
        "mAP_50": res['map_50'].item(),
        "mAP_75": res['map_75'].item()
    })

# Last expression of the cell: rendered as the results table.
pd.DataFrame(results_log)


Evaluating YOLO...


100%|██████████| 178/178 [00:03<00:00, 51.39it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:03<00:00, 47.58it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:04<00:00, 36.70it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:06<00:00, 27.63it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:08<00:00, 20.49it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:03<00:00, 47.37it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:03<00:00, 45.62it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:04<00:00, 36.08it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:05<00:00, 31.47it/s]


Evaluating YOLO...


100%|██████████| 178/178 [00:07<00:00, 22.38it/s]
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Evaluating Transformer (DetrForObjectDetection)...


100%|██████████| 178/178 [00:17<00:00, 10.14it/s]
Some weights of the model checkpoint at facebook/detr-resnet-101 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Evaluating Transformer (DetrForObjectDetection)...


100%|██████████| 178/178 [00:22<00:00,  7.89it/s]


Evaluating Transformer (RTDetrForObjectDetection)...


100%|██████████| 178/178 [00:10<00:00, 16.29it/s]


Evaluating Transformer (RTDetrForObjectDetection)...


100%|██████████| 178/178 [00:14<00:00, 11.97it/s]


Unnamed: 0,Model,mAP,mAP_50,mAP_75
0,yolov8n.pt,0.054409,0.070669,0.057373
1,yolov8s.pt,0.070794,0.095179,0.073255
2,yolov8m.pt,0.091733,0.114723,0.096591
3,yolov8l.pt,0.110904,0.137525,0.114877
4,yolov8x.pt,0.100478,0.119463,0.105739
5,yolo11n.pt,0.066007,0.085729,0.073492
6,yolo11s.pt,0.089336,0.111031,0.092705
7,yolo11m.pt,0.114758,0.148934,0.117783
8,yolo11l.pt,0.091296,0.114131,0.095288
9,yolo11x.pt,0.109154,0.13987,0.108331
