In [None]:
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.datasets import VOCDetection
from torchvision.transforms import functional as F
from torch.utils.data import DataLoader
from torchvision.ops import box_iou
from torchvision import transforms

# Load the pretrained model
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm against the installed torchvision version.
model = fasterrcnn_resnet50_fpn(pretrained=True)
# Put the network in evaluation mode; the loop further down only runs
# inference and reads 'boxes' / 'scores' from the outputs.
model.eval()

# Load the VOC2012 dataset
# download=False: the 2012 'val' split is expected to already exist under the
# current working directory (root='.').
dataset = VOCDetection(root='.', year='2012', image_set='val', download=False)
# Progress marker for the notebook output.
print("dataset")

# Define the transforms
def transform(image, target):
    """Resize a VOC sample to 800x800 and build detection-style targets.

    Args:
        image: PIL image (what ``VOCDetection`` yields when it is created
            without its own transform); its ``size`` is ``(width, height)``.
        target: VOC annotation dict; the objects are read from
            ``target['annotation']['object']`` (a single dict or a list).

    Returns:
        A ``(image, target)`` tuple where ``image`` is the resized image as a
        ``(C, H, W)`` tensor and ``target`` is a dict with:
            * ``'boxes'``: float32 tensor of shape ``(N, 4)`` holding
              ``[xmin, ymin, xmax, ymax]`` in resized-image pixels;
            * ``'labels'``: int64 tensor of shape ``(N,)``, 1 for ``'person'``
              and 0 for every other class.
    """
    # The original size has to be read *before* resizing. Reading it from the
    # resized tensor makes both scale factors exactly 1, which leaves the
    # boxes in original-image coordinates while the image is 800x800.
    orig_width, orig_height = image.size

    resize = transforms.Resize((800, 800))
    image = F.to_tensor(resize(image))

    # Tensor layout is (channels, height, width).
    new_height, new_width = image.shape[1], image.shape[2]
    scale_x = new_width / orig_width
    scale_y = new_height / orig_height

    objects = target['annotation']['object']
    # An image with a single object stores it as a bare dict, not a list.
    if isinstance(objects, dict):
        objects = [objects]

    boxes = []
    labels = []
    for obj in objects:
        bndbox = obj['bndbox']
        boxes.append([
            float(bndbox['xmin']) * scale_x,
            float(bndbox['ymin']) * scale_y,
            float(bndbox['xmax']) * scale_x,
            float(bndbox['ymax']) * scale_y,
        ])

        # Binary label: 1 for 'person', 0 for anything else.
        labels.append(int(obj['name'] == 'person'))

    # reshape keeps the (N, 4) layout even when there are no objects.
    boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4)
    labels = torch.tensor(labels, dtype=torch.int64)
    return image, {'boxes': boxes, 'labels': labels}

# Report how many samples the raw VOC dataset holds (nothing is transformed
# at this point).
print("dataset length: ", len(dataset))

class VOC_Dataset(torch.utils.data.Dataset):
    """Thin wrapper that applies a joint (image, target) transform on access.

    Args:
        dataset: Indexable collection whose items unpack into
            ``(image, target)``.
        transform: Optional callable ``transform(image, target)`` returning a
            new ``(image, target)`` pair. Skipped when falsy.
    """

    def __init__(self, dataset, transform=None):
        self.transform = transform
        self.dataset = dataset

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return sample ``idx``, transformed if a transform was supplied."""
        image, target = self.dataset[idx]
        if not self.transform:
            return image, target
        image, target = self.transform(image, target)
        return image, target

# Wrap the raw VOC dataset; `transform` runs per sample when an item is
# fetched, not here.
voc_dataset = VOC_Dataset(dataset, transform=transform)

# Progress marker only: no sample has been transformed yet at this point.
print("transformed dataset")

# Create a DataLoader
def collate_fn(batch):
    """Collate ``(image, target)`` samples into a batch.

    Images are stacked along a new leading dimension, so they must all share
    one shape. Targets are returned as a plain list, one entry per sample.
    """
    frames = []
    annotations = []
    for sample in batch:
        frames.append(sample[0])
        annotations.append(sample[1])
    return torch.stack(frames, dim=0), annotations

# Batches of two samples in dataset order (shuffle=False); collate_fn stacks
# the images and keeps the targets as a list.
dataloader = DataLoader(voc_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
# Progress marker for the notebook output.
print("dataloader")

# Evaluate the detector on the 'person' class. The ground-truth labels are
# binary (1 = person, 0 = anything else), so 'person' is the only class that
# can be scored.
PERSON_LABEL = 1  # 'person' in the binary ground truth and in the COCO ids the model predicts
IOU_THRESHOLD = 0.5  # minimum overlap for a detection to count as a match


def _average_precision(scores, is_tp, num_gt):
    """Return the area under the precision-recall curve as a 0-d tensor.

    Args:
        scores: 1-D tensor of detection confidences across the whole dataset.
        is_tp: 1-D bool tensor, True where the detection matched a ground
            truth box.
        num_gt: Total number of ground-truth boxes of the class.

    Returns:
        Average precision (all-point interpolation); 0 when there is no
        ground truth or no detection.
    """
    if num_gt == 0:
        return torch.tensor(0.0)

    order = torch.argsort(scores, descending=True)
    hits = is_tp[order].to(torch.float32)
    tp_cum = torch.cumsum(hits, dim=0)
    fp_cum = torch.cumsum(1.0 - hits, dim=0)
    recall = tp_cum / num_gt
    # tp_cum + fp_cum is the detection rank (>= 1), so no division by zero.
    precision = tp_cum / (tp_cum + fp_cum)

    # Sentinels at both ends, then make precision monotonically
    # non-increasing (the precision envelope) before integrating.
    recall = torch.cat([recall.new_zeros(1), recall, recall.new_ones(1)])
    precision = torch.cat([precision.new_zeros(1), precision, precision.new_zeros(1)])
    precision = torch.flip(torch.cummax(torch.flip(precision, [0]), dim=0).values, [0])
    return ((recall[1:] - recall[:-1]) * precision[1:]).sum()


# Create lists to store true and predicted boxes, labels and scores
true_boxes = []
true_labels = []
pred_boxes = []
pred_scores = []
pred_labels = []

print("dataloader length: ", len(dataloader))
with torch.no_grad():
    for images, targets in dataloader:
        outputs = model(images)
        for target, output in zip(targets, outputs):
            true_boxes.append(target['boxes'])
            true_labels.append(target['labels'])
            pred_boxes.append(output['boxes'])
            pred_scores.append(output['scores'])
            # The predicted class is needed so that only 'person' detections
            # are compared against 'person' ground truth.
            pred_labels.append(output['labels'])

# Compute the mAP
print("true_boxes length:", len(true_boxes))
all_scores = []
all_hits = []
num_person_gt = 0
for true_box, true_label, pred_box, pred_score, pred_label in zip(
        true_boxes, true_labels, pred_boxes, pred_scores, pred_labels):
    gt_boxes = true_box[true_label == PERSON_LABEL]
    keep = pred_label == PERSON_LABEL
    det_boxes = pred_box[keep]
    det_scores = pred_score[keep]
    num_person_gt += gt_boxes.shape[0]

    hits = torch.zeros(det_scores.shape[0], dtype=torch.bool)
    # Images without detections or without person boxes are skipped here;
    # calling max() over an empty IoU dimension would raise.
    if gt_boxes.shape[0] > 0 and det_boxes.shape[0] > 0:
        ious = box_iou(det_boxes, gt_boxes)
        claimed = torch.zeros(gt_boxes.shape[0], dtype=torch.bool)
        # Greedy matching, most confident detection first. Each ground-truth
        # box can be claimed once; later duplicates count as false positives.
        for det_idx in torch.argsort(det_scores, descending=True).tolist():
            best_iou, best_gt = ious[det_idx].max(dim=0)
            best_gt = int(best_gt)
            if best_iou > IOU_THRESHOLD and not claimed[best_gt]:
                claimed[best_gt] = True
                hits[det_idx] = True

    all_scores.append(det_scores)
    all_hits.append(hits)

if all_scores:
    flat_scores = torch.cat(all_scores)
    flat_hits = torch.cat(all_hits)
else:
    flat_scores = torch.zeros(0)
    flat_hits = torch.zeros(0, dtype=torch.bool)

# One entry per evaluated class; only 'person' can be scored here, so the
# mean below equals the person AP.
ap = [_average_precision(flat_scores, flat_hits, num_person_gt)]
mAP = torch.stack(ap).mean().item()

print(f'mAP: {mAP}')


dataset
dataset length:  5823
transformed dataset
dataloader
dataloader length:  2912
