In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches
from collections import Counter
import cv2
from glob import glob
from tqdm import tqdm
from termcolor import colored

import torch
from torch import nn, optim
from torch.utils.data import DataLoader

import torchvision
from torchvision import transforms

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [None]:
class CustomVOCDataset(torchvision.datasets.VOCDetection):
    """
    Pascal VOC detection dataset that yields YOLOv1-style training targets.

    Each item is ``(image, label_matrix)``. ``label_matrix`` has shape
    ``(S, S, C + 5 * B)``; for every grid cell the first ``C`` entries are a
    one-hot class vector, entry ``C`` is the objectness flag and entries
    ``C + 1 .. C + 4`` hold the box as ``(x_cell, y_cell, width_cell, height_cell)``.
    Only one object is stored per cell, so the remaining ``5 * (B - 1)``
    entries stay zero.
    """

    def __init__(self, root, year="2012", image_set="train", download=False,
                 class_mapping=None, S=7, B=2, C=20, custom_transforms=None):
        """
        Initialize YOLO-specific configuration parameters.
        Args:
            root: Root directory of the VOC dataset.
            year: Dataset year (default: "2012").
            image_set: Dataset split (default: "train").
            download: Whether to download the dataset.
            class_mapping: Mapping of class names to indices.
            S: Grid size (S x S).
            B: Number of bounding boxes per grid cell.
            C: Number of classes.
            custom_transforms: Callable invoked as
                ``custom_transforms(image=..., bboxes=..., labels=...)`` that
                returns a dict with the same three keys.
        """
        super(CustomVOCDataset, self).__init__(root, year, image_set, download)
        self.S = S  # Grid size
        self.B = B  # Number of bounding boxes
        self.C = C  # Number of classes
        self.class_mapping = class_mapping  # Class name-to-index mapping
        self.custom_transforms = custom_transforms  # Additional transformations

    def __getitem__(self, index):
        """
        Return ``(image, label_matrix)`` for the sample at ``index``.

        Boxes whose class id falls outside ``[0, C)`` (for example the ``-1``
        produced for a class name missing from ``class_mapping``) are skipped
        instead of being written into an unrelated channel of the target.
        """
        # Get the image and target annotations
        image, target = super(CustomVOCDataset, self).__getitem__(index)
        img_width, img_height = image.size

        # Convert target annotations to YOLO format. The reshape keeps the
        # array two-dimensional even when the image has no annotated objects.
        boxes = np.asarray(
            convert_to_yolo_format(target, img_width, img_height, self.class_mapping),
            dtype=np.float64,
        ).reshape(-1, 5)

        # Separate box coordinates and labels
        just_boxes = boxes[:, 1:]
        labels = boxes[:, 0]

        # Apply custom transformations
        if self.custom_transforms:
            sample = {
                'image': np.array(image),
                'bboxes': just_boxes,
                'labels': labels
            }
            sample = self.custom_transforms(**sample)
            image = sample['image']
            just_boxes = sample['bboxes']
            labels = sample['labels']

        # Create an empty label matrix for YOLO ground truth
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))

        # Convert to PyTorch tensors
        boxes = torch.tensor(just_boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)
        image = torch.as_tensor(image, dtype=torch.float32)

        # Channel layout is derived from C so non-default class counts work.
        obj_idx = self.C
        box_slice = slice(self.C + 1, self.C + 5)
        last_cell = self.S - 1

        # Populate the label matrix
        for box, class_label in zip(boxes, labels):
            x, y, width, height = box.tolist()
            class_label = int(class_label)

            # A negative index would silently write into the trailing channels.
            if not 0 <= class_label < self.C:
                continue

            # Determine grid cell. A centre at exactly 1.0 would otherwise
            # map to row/column S, which is out of bounds.
            i = min(max(int(self.S * y), 0), last_cell)
            j = min(max(int(self.S * x), 0), last_cell)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # Scale width and height relative to grid cell
            width_cell, height_cell = width * self.S, height * self.S

            # Only the first object that lands in a cell is kept
            if label_matrix[i, j, obj_idx] == 0:
                label_matrix[i, j, obj_idx] = 1  # Object exists
                label_matrix[i, j, box_slice] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )  # Box coordinates
                label_matrix[i, j, class_label] = 1  # One-hot class label

        return image, label_matrix

In [None]:
def convert_to_yolo_format(target, img_width, img_height, class_mapping):
    """
    Convert annotation data from VOC format to YOLO format.

    Parameters:
        target (dict): Annotation data from VOCDetection dataset.
        img_width (int): Width of the original image. Unused: the size
            recorded inside the annotation is used for normalisation instead.
        img_height (int): Height of the original image. Unused, see above.
        class_mapping (dict): Mapping from class names to integer IDs.

    Returns:
        np.ndarray: Array of shape [N, 5] for N bounding boxes,
                    each with [class_id, x_center, y_center, width, height],
                    coordinates normalised by the annotated image size.
                    The shape is (0, 5) when there are no objects, so callers
                    can always slice columns. Unknown class names get id -1.
    """
    annotation = target['annotation']

    # A missing or single (non-list) 'object' entry is normalised to a list.
    annotations = annotation.get('object', [])
    if not isinstance(annotations, list):
        annotations = [annotations]

    # Get the real width and height of the image from the annotation.
    real_width = int(annotation['size']['width'])
    real_height = int(annotation['size']['height'])

    boxes = []
    for anno in annotations:
        bndbox = anno['bndbox']

        # float() accepts both "48" and "48.0"; int() would reject the latter.
        xmin = float(bndbox['xmin']) / real_width
        xmax = float(bndbox['xmax']) / real_width
        ymin = float(bndbox['ymin']) / real_height
        ymax = float(bndbox['ymax']) / real_height

        # Corner form -> centre/size form.
        x_center = (xmin + xmax) / 2
        y_center = (ymin + ymax) / 2
        width = xmax - xmin
        height = ymax - ymin

        class_id = class_mapping.get(anno['name'], -1)  # -1 if class not found

        boxes.append([class_id, x_center, y_center, width, height])

    # reshape keeps the result two-dimensional even when `boxes` is empty.
    return np.array(boxes, dtype=np.float64).reshape(-1, 5)

In [None]:
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Compute the IoU between predicted and reference boxes.

    Parameters:
        boxes_preds (tensor): Predicted boxes, last dimension of size 4.
        boxes_labels (tensor): Reference boxes, last dimension of size 4.
        box_format (str): "midpoint" for (x_center, y_center, width, height)
            or "corners" for (x1, y1, x2, y2).

    Returns:
        tensor: IoU per box pair; the last dimension is kept with size 1.

    Raises:
        ValueError: If ``box_format`` is neither "midpoint" nor "corners".
    """
    if box_format == "midpoint":
        def to_corners(boxes):
            # Centre/size -> (x1, y1, x2, y2), each kept as a size-1 slice.
            centre_x, centre_y = boxes[..., 0:1], boxes[..., 1:2]
            return (
                centre_x - boxes[..., 2:3] / 2,
                centre_y - boxes[..., 3:4] / 2,
                centre_x + boxes[..., 2:3] / 2,
                centre_y + boxes[..., 3:4] / 2,
            )
    elif box_format == "corners":
        def to_corners(boxes):
            return tuple(boxes[..., k:k + 1] for k in range(4))
    else:
        raise ValueError("box_format must be either 'midpoint' or 'corners'.")

    pred_x1, pred_y1, pred_x2, pred_y2 = to_corners(boxes_preds)
    label_x1, label_y1, label_x2, label_y2 = to_corners(boxes_labels)

    # Overlap rectangle; clamp(0) turns "no overlap" into zero width/height.
    overlap_w = (torch.min(pred_x2, label_x2) - torch.max(pred_x1, label_x1)).clamp(0)
    overlap_h = (torch.min(pred_y2, label_y2) - torch.max(pred_y1, label_y1)).clamp(0)
    intersection = overlap_w * overlap_h

    pred_area = ((pred_x2 - pred_x1) * (pred_y2 - pred_y1)).abs()
    label_area = ((label_x2 - label_x1) * (label_y2 - label_y1)).abs()

    # The small constant keeps the ratio finite for degenerate boxes.
    return intersection / (pred_area + label_area - intersection + 1e-6)

In [None]:
def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Greedy per-class Non-Maximum Suppression.

    Parameters:
        bboxes (list): Boxes given as [class_pred, prob_score, c0, c1, c2, c3];
                       the four coordinates are interpreted per ``box_format``.
        iou_threshold (float): A lower-scored box of the same class survives
                               only if its IoU with the kept box is below this.
        threshold (float): Boxes with prob_score <= threshold are dropped first.
        box_format (str): "midpoint" or "corners", forwarded to the IoU helper.

    Returns:
        list: The kept boxes, ordered by descending score.
    """
    assert isinstance(bboxes, list), "bboxes must be a list."

    # Confident candidates only, best score first.
    candidates = sorted(
        (candidate for candidate in bboxes if candidate[1] > threshold),
        key=lambda candidate: candidate[1],
        reverse=True,
    )

    kept = []
    while candidates:
        best, remaining = candidates[0], candidates[1:]

        survivors = []
        for other in remaining:
            # Boxes of another class never compete with `best`.
            if other[0] != best[0]:
                survivors.append(other)
                continue
            overlap = intersection_over_union(
                torch.tensor(best[2:]),
                torch.tensor(other[2:]),
                box_format=box_format,
            )
            if overlap < iou_threshold:
                survivors.append(other)

        candidates = survivors
        kept.append(best)

    return kept

In [None]:
def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Calculate the mean average precision (mAP).

    Parameters:
        pred_boxes (list): Predicted bounding boxes, each defined as
            [train_idx, class_pred, prob_score, c0, c1, c2, c3]; the four
            coordinates are interpreted according to ``box_format``.
        true_boxes (list): Ground truth bounding boxes, same layout as pred_boxes.
        iou_threshold (float): IoU a prediction must exceed to count as correct.
        box_format (str): "midpoint" or "corners" for bounding box format.
        num_classes (int): Number of classes.

    Returns:
        tensor: 0-dim tensor with the mean of the per-class average precision,
            taken over the classes that have at least one ground-truth box.
            A zero tensor is returned when no class has any ground truth, so
            callers can always call ``.item()`` on the result.
    """
    average_precisions = []
    epsilon = 1e-6  # Small value to prevent division by zero

    for c in range(num_classes):
        ground_truths = [gt for gt in true_boxes if gt[1] == c]

        # Total ground truth boxes for the current class
        total_true_bboxes = len(ground_truths)

        # Classes without ground truth do not contribute to the mean.
        if total_true_bboxes == 0:
            continue

        # Highest-confidence detections are matched first.
        detections = sorted(
            (d for d in pred_boxes if d[1] == c), key=lambda d: d[2], reverse=True
        )

        # Group ground truths per image once instead of re-filtering the full
        # list for every detection.
        gts_by_image = {}
        for gt in ground_truths:
            gts_by_image.setdefault(gt[0], []).append(gt)
        # One "already matched" flag per ground-truth box.
        matched = {img: [False] * len(gts) for img, gts in gts_by_image.items()}

        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))

        for detection_idx, detection in enumerate(detections):
            image_gts = gts_by_image.get(detection[0], [])

            best_iou = 0.0
            best_gt_idx = -1

            # Best-overlapping ground truth in the same image.
            for idx, gt in enumerate(image_gts):
                iou = float(
                    intersection_over_union(
                        torch.tensor(detection[3:]),
                        torch.tensor(gt[3:]),
                        box_format=box_format,
                    )
                )
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            # A ground truth can be claimed only once; later hits are FPs.
            if best_iou > iou_threshold and not matched[detection[0]][best_gt_idx]:
                TP[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = True
            else:
                FP[detection_idx] = 1

        # Cumulative sums of TP and FP
        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)

        # Calculate recall and precision
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)

        # Prepend the (recall=0, precision=1) starting point. Float literals
        # keep the dtypes aligned for torch.cat.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))

        # Area under the precision-recall curve
        average_precisions.append(torch.trapz(precisions, recalls))

    # Nothing to average: avoid dividing by zero.
    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)

In [None]:
"""
Information about the architectural configuration:
- Each tuple represents (kernel_size, number_of_filters, stride, padding).
- "M" indicates a max-pooling layer with a 2x2 pool size and stride.
- Nested lists represent repeated blocks, where the third element specifies the number of repetitions.
"""

# YOLOv1 backbone layout, read entry by entry when the conv stack is built:
#   tuple -> one conv block given as (kernel_size, out_channels, stride, padding)
#   "M"   -> one 2x2 max-pool with stride 2
#   list  -> [conv_a, conv_b, repeats]: the conv pair is emitted `repeats` times
architecture_config = [
    # --- stem ---
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    # --- 1x1 / 3x3 bottleneck pairs ---
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    # --- pair repeated 4 times, then widen to 1024 channels ---
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    # --- pair repeated 2 times ---
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    # --- final 3x3 convolutions; the second one has stride 2 ---
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

In [None]:
class CNNBlock(nn.Module):
    """
    Conv2d (without bias) -> BatchNorm2d -> LeakyReLU(0.1).

    Extra keyword arguments (kernel_size, stride, padding, ...) are passed
    straight through to ``nn.Conv2d``.
    """
    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # bias=False: the convolution is created without its own bias term
        # and is followed directly by batch normalisation.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        features = self.conv(x)
        normalised = self.batchnorm(features)
        return self.leakyrelu(normalised)

In [None]:
class Yolov1(nn.Module):
    """
    YOLOv1 network: a convolutional backbone built from ``architecture_config``
    followed by a two-layer fully connected head.

    Keyword arguments ``split_size``, ``num_boxes`` and ``num_classes`` are
    required and are forwarded to the head builder.
    """
    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        features = self.darknet(x)
        flat = torch.flatten(features, start_dim=1)
        return self.fcs(flat)

    def _create_conv_layers(self, architecture):
        """
        Translate the architecture description into an ``nn.Sequential``.

        Tuples become a single conv block, strings become a 2x2 max-pool and
        lists ``[conv_a, conv_b, n]`` become the conv pair repeated ``n`` times.
        Entries of any other type are ignored.
        """
        def conv_block(spec, channels_in):
            # spec is (kernel_size, out_channels, stride, padding)
            return CNNBlock(
                channels_in, spec[1], kernel_size=spec[0], stride=spec[2], padding=spec[3]
            )

        stack = []
        channels = self.in_channels

        for entry in architecture:
            kind = type(entry)

            if kind == tuple:
                stack.append(conv_block(entry, channels))
                channels = entry[1]

            elif kind == str:
                stack.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))

            elif kind == list:
                first, second, repeats = entry
                for _ in range(repeats):
                    stack.append(conv_block(first, channels))
                    stack.append(conv_block(second, first[1]))
                    channels = second[1]

        return nn.Sequential(*stack)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """
        Build the prediction head.

        Parameters:
            split_size: Grid size S (e.g., 7 for a 7x7 grid).
            num_boxes: Number of bounding boxes B per grid cell.
            num_classes: Number of object classes C.

        The head expects 1024 * S * S input features and produces
        S * S * (C + 5 * B) outputs.
        """
        grid, boxes_per_cell, classes = split_size, num_boxes, num_classes
        in_features = 1024 * grid * grid
        out_features = grid * grid * (classes + boxes_per_cell * 5)
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features, 4096),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, out_features),  # Output layer
        )

In [None]:
class YoloLoss(nn.Module):
    """
    Calculate the YOLO (v1) loss, combining components for box coordinates, object confidence,
    no-object confidence, and class probabilities.

    Per-cell channel layout (derived from ``C``):
        [0, C)            class scores
        C                 confidence of box 1
        [C + 1, C + 5)    box 1 as (x, y, w, h)
        C + 5             confidence of box 2
        [C + 6, C + 10)   box 2 as (x, y, w, h)

    Only two predicted boxes per cell are read, so the loss assumes ``B == 2``.
    """

    def __init__(self, S=7, B=2, C=20):
        """
        Initialize the YOLO loss.

        Parameters:
            S (int): Grid size (e.g., 7 for 7x7 grid).
            B (int): Number of bounding boxes per grid cell.
            C (int): Number of classes.
        """
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        self.lambda_noobj = 0.5  # Weight for no-object loss
        self.lambda_coord = 5  # Weight for box coordinate loss

    def forward(self, predictions, target):
        """
        Compute the YOLO loss.

        Parameters:
            predictions (tensor): Predicted outputs; reshaped here to
                (BATCH_SIZE, S, S, C + B * 5).
            target (tensor): Ground truth of shape (BATCH_SIZE, S, S, C + B * 5).
                Only its class channels, confidence channel ``C`` and box
                channels ``C + 1 .. C + 4`` are read.

        Returns:
            loss (tensor): Scalar loss, summed (not averaged) over the batch.
        """
        # Channel offsets follow the configured class count instead of a
        # hard-coded 20.
        C = self.C
        conf1 = slice(C, C + 1)
        box1 = slice(C + 1, C + 5)
        conf2 = slice(C + 5, C + 6)
        box2 = slice(C + 6, C + 10)

        # Reshape predictions to (BATCH_SIZE, S, S, C + B * 5)
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)

        # IoU of each predicted box with the single target box of the cell
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # bestbox is 0 where box 1 has the higher IoU and 1 where box 2 does
        _, bestbox = torch.max(ious, dim=0)
        exists_box = target[..., C].unsqueeze(3)  # Iobj_i in the paper

        # ======================== #
        #   FOR BOX COORDINATES    #
        # ======================== #
        box_predictions = exists_box * (
            bestbox * predictions[..., box2] + (1 - bestbox) * predictions[..., box1]
        )
        box_targets = exists_box * target[..., box1]

        # Compare square roots of width/height. Predictions may be negative,
        # so the root is taken of the magnitude and the sign is restored; the
        # small constant keeps the argument away from zero.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ==================== #
        #    FOR OBJECT LOSS   #
        # ==================== #
        # Confidence of the box selected by bestbox
        pred_box = (
            bestbox * predictions[..., conf2] + (1 - bestbox) * predictions[..., conf1]
        )
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        # ======================= #
        #  FOR NO OBJECT LOSS     #
        # ======================= #
        # Both confidences are penalised in cells without an object.
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )

        # ================== #
        #   FOR CLASS LOSS   #
        # ================== #
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2),
        )

        # ================== #
        #   FINAL LOSS       #
        # ================== #
        loss = (
            self.lambda_coord * box_loss  # Box coordinates loss
            + object_loss  # Object confidence loss
            + self.lambda_noobj * no_object_loss  # No-object confidence loss
            + class_loss  # Class probability loss
        )

        return loss

In [None]:
# Set the random seed for reproducibility.
# Only PyTorch's global RNG is seeded in this cell; NumPy and Python's
# `random` module are not seeded here.
# NOTE(review): presumably the albumentations pipeline draws from those other
# RNGs, in which case augmentations are not covered by this seed — confirm.
seed = 123
torch.manual_seed(seed)

<torch._C.Generator at 0x7f30ab2874f0>

In [None]:
# Hyperparameters
LEARNING_RATE = 2e-5
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the first `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16  # Originally 64 in the YOLO paper, reduced for smaller GPUs
EPOCHS = 300  # Number of training epochs
NUM_WORKERS = 2  # Number of worker processes for data loading
PIN_MEMORY = DEVICE == "cuda"  # Pinned host memory only helps host-to-GPU copies
LOAD_MODEL = False  # Whether to load a pre-trained model
LOAD_MODEL_FILE = "yolov1.pth.tar"  # File name for the pre-trained model (if LOAD_MODEL=True)

In [None]:
# Network input resolution in pixels; both transform pipelines resize to it.
WIDTH = 448
HEIGHT = 448

def get_train_transforms():
    """
    Build the albumentations pipeline used for training samples.

    Order: colour jitter (one of hue/saturation/value or brightness/contrast),
    rare grayscale conversion, random horizontal and vertical flips, resize to
    HEIGHT x WIDTH, random cutout, conversion to a tensor. Boxes are expected
    in "yolo" format and class ids travel in the ``labels`` field.
    """
    # NOTE(review): the hue/sat/val limits are 0.2 here; these limits are
    # usually given as integer shifts, so this jitter may be nearly a no-op —
    # confirm the intended strength.
    hue_saturation = A.HueSaturationValue(
        hue_shift_limit=0.2,
        sat_shift_limit=0.2,
        val_shift_limit=0.2,
        p=0.9,
    )
    brightness_contrast = A.RandomBrightnessContrast(
        brightness_limit=0.2,
        contrast_limit=0.2,
        p=0.9,
    )
    colour_jitter = A.OneOf([hue_saturation, brightness_contrast], p=0.9)

    # NOTE(review): A.Cutout is, as far as I know, deprecated in later
    # albumentations releases in favour of CoarseDropout — confirm against the
    # installed version.
    cutout = A.Cutout(
        num_holes=8,
        max_h_size=64,
        max_w_size=64,
        fill_value=0,
        p=0.5,
    )

    steps = [
        colour_jitter,
        A.ToGray(p=0.01),  # Occasionally drop colour information
        A.HorizontalFlip(p=0.2),
        A.VerticalFlip(p=0.2),
        A.Resize(height=HEIGHT, width=WIDTH, p=1.0),
        cutout,
        ToTensorV2(p=1.0),  # Final step: convert to a PyTorch tensor
    ]

    box_settings = A.BboxParams(
        format="yolo",
        min_area=0,
        min_visibility=0,
        label_fields=["labels"],
    )

    return A.Compose(steps, p=1.0, bbox_params=box_settings)

In [None]:
def get_valid_transforms():
    """
    Build the albumentations pipeline for validation samples.

    No augmentation is applied: the image is resized to HEIGHT x WIDTH and
    converted to a tensor. Boxes are expected in "yolo" format and class ids
    travel in the ``labels`` field.
    """
    steps = [
        A.Resize(height=HEIGHT, width=WIDTH, p=1.0),
        ToTensorV2(p=1.0),
    ]
    box_settings = A.BboxParams(
        format="yolo",
        min_area=0,
        min_visibility=0,
        label_fields=["labels"],
    )
    return A.Compose(steps, p=1.0, bbox_params=box_settings)

In [None]:
# Pascal VOC class name -> integer id. Ids are assigned by position in the
# tuple below, so the order must not change.
class_mapping = {
    name: index
    for index, name in enumerate(
        (
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        )
    )
}

In [None]:
def train_fn(train_loader, model, optimizer, loss_fn, epoch):
    """
    Training loop for one epoch.

    Parameters:
        train_loader (DataLoader): DataLoader for training data.
        model (nn.Module): YOLOv1 model.
        optimizer (torch.optim.Optimizer): Optimizer for training.
        loss_fn (nn.Module): Loss function (YoloLoss).
        epoch (int): Current epoch number (used only in the progress output).

    Returns:
        avg_mAP (float): Mean of the per-batch mAP values for the epoch.
    """
    model.train()  # Set model to training mode
    mean_loss = []
    mean_mAP = []

    total_batches = len(train_loader)
    # Report roughly five times per epoch. The lower bound of 1 avoids a
    # modulo-by-zero when the loader has fewer than five batches.
    display_interval = max(1, total_batches // 5)

    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(DEVICE), y.to(DEVICE)

        # Forward pass
        out = model(x)

        # Compute loss
        loss = loss_fn(out, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Per-batch mAP, computed from the outputs produced before this
        # optimizer step.
        pred_boxes, true_boxes = get_bboxes_training(
            out, y, iou_threshold=0.5, threshold=0.4
        )
        mAP = mean_average_precision(
            pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint"
        )

        # Track metrics
        mean_loss.append(loss.item())
        mean_mAP.append(mAP.item())

        # Display progress
        if batch_idx % display_interval == 0 or batch_idx == total_batches - 1:
            print(
                f"Epoch: {epoch:3} \t Iter: {batch_idx:3}/{total_batches:3} "
                f"\t Loss: {loss.item():.10f} \t mAP: {mAP.item():.10f}"
            )

    # Compute average metrics
    avg_loss = sum(mean_loss) / len(mean_loss)
    avg_mAP = sum(mean_mAP) / len(mean_mAP)

    print(colored(f"Train \t Loss: {avg_loss:.10f} \t mAP: {avg_mAP:.10f}", "green"))

    return avg_mAP

In [None]:
def test_fn(test_loader, model, loss_fn, epoch):
    """
    Testing loop for one epoch.

    Parameters:
        test_loader (DataLoader): DataLoader for test/validation data.
        model (nn.Module): YOLOv1 model.
        loss_fn (nn.Module): Loss function (YoloLoss).
        epoch (int): Current epoch number.

    Returns:
        avg_mAP (float): Average mean Average Precision (mAP) for the epoch.
    """
    model.eval()  # Set model to evaluation mode
    mean_loss = []
    mean_mAP = []

    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(test_loader):
            x, y = x.to(DEVICE), y.to(DEVICE)

            # Forward pass
            out = model(x)

            # Compute loss
            loss = loss_fn(out, y)

            # Compute predictions and mAP
            pred_boxes, true_boxes = get_bboxes_training(
                out, y, iou_threshold=0.5, threshold=0.4
            )
            mAP = mean_average_precision(
                pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint"
            )

            # Track metrics
            mean_loss.append(loss.item())
            mean_mAP.append(mAP.item())

    # Compute average metrics
    avg_loss = sum(mean_loss) / len(mean_loss)
    avg_mAP = sum(mean_mAP) / len(mean_mAP)

    print(colored(f"Test \t Loss: {avg_loss:.10f} \t mAP: {avg_mAP:.10f}", "yellow"))

    return avg_mAP

In [None]:
def _draw_labeled_boxes(ax, boxes, id_to_name, img_width, img_height, color):
    """Draw each box as a `color` rectangle with its class name at the top-left corner."""
    for box in boxes:
        # The class id is always first and the coordinates are always the
        # last four entries, so both [class, x, y, w, h] and
        # [class, score, x, y, w, h] are handled.
        label_index = box[0]
        x_center, y_center, box_w, box_h = box[-4:]

        upper_left_x = x_center - box_w / 2
        upper_left_y = y_center - box_h / 2

        # Coordinates are fractions of the image size; scale to pixels.
        rect = patches.Rectangle(
            (upper_left_x * img_width, upper_left_y * img_height),
            box_w * img_width,
            box_h * img_height,
            linewidth=1,
            edgecolor=color,
            facecolor="none",
        )
        ax.add_patch(rect)

        # int() lets float ids such as 14.0 match the integer keys.
        class_name = id_to_name.get(int(label_index), "Unknown")
        ax.text(
            upper_left_x * img_width,
            upper_left_y * img_height,
            class_name,
            color="white",
            fontsize=12,
            bbox=dict(facecolor=color, alpha=0.5),
        )


def plot_image_with_labels(image, ground_truth_boxes, predicted_boxes, class_mapping):
    """
    Draw ground truth and predicted bounding boxes on an image with labels.

    Parameters:
        image (numpy.ndarray): The input image, shaped (height, width, channels).
        ground_truth_boxes (list): Ground truth boxes, drawn in green. Each is
            [class_id, x_center, y_center, width, height] or
            [class_id, score, x_center, y_center, width, height], with
            coordinates given as fractions of the image size.
        predicted_boxes (list): Predicted boxes, same layouts, drawn in red.
        class_mapping (dict): Mapping of class names to class IDs (it is
            inverted internally to look up names).
    """
    # Invert the class mapping for quick lookup
    inverted_class_mapping = {v: k for k, v in class_mapping.items()}

    # Convert the image to a numpy array and get its dimensions
    im = np.array(image)
    height, width, _ = im.shape

    # Set up the plot
    fig, ax = plt.subplots(1)
    ax.imshow(im)

    _draw_labeled_boxes(ax, ground_truth_boxes, inverted_class_mapping, width, height, "green")
    _draw_labeled_boxes(ax, predicted_boxes, inverted_class_mapping, width, height, "red")

    plt.show()

In [None]:
def test():
    """
    Run the YOLOv1 model on the first validation batch and visualize results.

    Up to eight images of that batch are shown with ground-truth boxes in
    green and predicted boxes in red. Weights are loaded from
    LOAD_MODEL_FILE only when LOAD_MODEL is set; otherwise the model is
    randomly initialised.
    """
    # Initialize the YOLO model
    model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)

    # Load the model's weights if specified. map_location lets a checkpoint
    # saved on a GPU be restored on whatever DEVICE is in use.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    if LOAD_MODEL:
        checkpoint = torch.load(LOAD_MODEL_FILE, map_location=DEVICE)
        model.load_state_dict(checkpoint["state_dict"])

    # Prepare the test dataset and DataLoader. The YOLO-specific settings are
    # constructor arguments of CustomVOCDataset.
    test_dataset = CustomVOCDataset(
        root="./data",
        image_set="val",
        download=False,
        class_mapping=class_mapping,
        custom_transforms=get_valid_transforms(),
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=False,
        drop_last=False,
    )

    model.eval()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(DEVICE)
            out = model(x)

            # Convert predictions and ground truth to bounding boxes
            # (cellboxes_to_boxes is defined elsewhere in the notebook).
            pred_bboxes = cellboxes_to_boxes(out)
            gt_bboxes = cellboxes_to_boxes(y)

            # Visualize at most 8 images; the batch may hold fewer.
            for idx in range(min(8, x.shape[0])):
                pred_box = non_max_suppression(
                    pred_bboxes[idx],
                    iou_threshold=0.5,
                    threshold=0.4,
                    box_format="midpoint",
                )
                gt_box = non_max_suppression(
                    gt_bboxes[idx],
                    iou_threshold=0.5,
                    threshold=0.4,
                    box_format="midpoint",
                )
                # CHW tensor -> HWC array scaled by 1/255 for plotting
                image = x[idx].permute(1, 2, 0).cpu().numpy() / 255.0
                plot_image_with_labels(image, gt_box, pred_box, class_mapping)
            break  # Stop after processing the first batch