# Object detection with YOLOS

In this notebook we will apply YOLOS for object detection, implement NMS, and visualize results.

The original paper (NeurIPS 2021): https://arxiv.org/pdf/2106.00666

**Goal.** The goal of this notebook is to develop basic skills in working with detection models and bounding boxes.

You need the following extra libraries beyond PyTorch:
* transformers
* Pillow (PIL)
* matplotlib

In [None]:
import torch
import matplotlib.patches as patches
from transformers import AutoImageProcessor, AutoModelForObjectDetection
from transformers.models.yolos.image_processing_yolos import center_to_corners_format
from PIL import Image
from matplotlib import pyplot as plt

# Minimum top-class probability a detection needs in order to be drawn.
CONFIDENCE_THRESHOLD = 0.5
# IoU threshold intended for non-maximum suppression.
IOU_THRESHOLD = 0.5

# Helper tools. You can skip this block.

def check_batch_iou(fn):
    """Validate an IoU implementation against hand-computed reference values.

    Input:
    1. fn: A callable `fn(reference, boxes)` that receives a reference box with
       shape (4) and boxes with shape (4, 4) and returns four IoU values.

    Returns:
    True if the output has shape (4,) and every value is within 1e-6 of the
    expected IoU, False otherwise. A diagnostic message is printed either way.
    """
    tolerance = 1e-6
    reference = torch.tensor([0, 0, 4, 6])
    boxes = torch.tensor([[0, 0, 4, 4],
                          [-2, -2, 2, 2],
                          [-4, -4, 0, 0],
                          [-6, -6, -2, -2]])
    iou_gt = torch.tensor([2 / 3, 1 / 9, 0, 0])
    result = torch.as_tensor(fn(reference, boxes))
    if result.shape != (4,):
        print("Wrong output size")
        return False
    for i, box in enumerate(boxes):
        error = abs(result[i] - iou_gt[i])
        # Phrased as "not (error <= tolerance)" on purpose: every comparison
        # with NaN is False, so "error > tolerance" would let a NaN answer pass.
        if not (error <= tolerance):
            print("Wrong IoU")
            print("Reference:", reference)
            print("Box:", box)
            print("Answer:", result[i])
            print("True answer:", iou_gt[i])
            return False
    print("OK!")
    return True

# Preprocessing

We will use the standard image preprocessor from HuggingFace.

In [None]:
# Open "image.jpg" (path relative to the current working directory) and preview it.
image = Image.open("image.jpg")
plt.title("Input image")
plt.imshow(image)

In [None]:
# Build the preprocessor published with the "hustvl/yolos-tiny" checkpoint and
# run it on the image, asking for PyTorch tensors.
image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
inputs = image_processor(images=image, return_tensors="pt")
# Take the first item of the batch and move its first axis to the end for plotting
# (presumably channels-first -> channels-last; confirm against the processor output).
value = inputs["pixel_values"][0].permute(1, 2, 0)
# Min-max rescale for display only: shift the minimum to 0, then divide by the new maximum.
value = value - value.min()
value = value / value.max()
plt.title("Preprocessed")
plt.imshow(value)
plt.show()

# Apply the model

We will use the pretrained YOLOS model from HuggingFace.

In [None]:
# Load the pretrained "hustvl/yolos-tiny" object detection model.
model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs["logits"]
boxes = outputs["pred_boxes"]
print("Logits:", logits.shape)
print("Boxes:", boxes.shape)

# Convert boxes from (center_x, center_y, width, height) to (left, upper, right, bottom).
boxes = center_to_corners_format(boxes)

# Turn the class logits into probabilities by normalizing over the last axis.
probs = torch.nn.functional.softmax(logits, dim=-1)  # (B, N, C).

# The last class is reserved for "no object".
no_obj_probs = probs[:, :, -1]  # (B, N).
# Probability that a box contains any object at all.
obj_probs = 1 - no_obj_probs  # (B, N).

def draw_boxes(image, boxes, probs):
    """Show the image and outline every confident object detection.

    Inputs:
        image: Image object exposing `.size` as (width, height) that
            `plt.imshow` can display.
        boxes: Boxes with shape (N, 4) in the (left, top, right, bottom) format,
            with coordinates given as fractions of the image width / height.
        probs: Class probabilities with shape (N, C + 1), with the last class
            reserved for "no object".

    A box is drawn (and reported with `print`) when its most probable class
    reaches CONFIDENCE_THRESHOLD and that class is not "no object". Class names
    are looked up in the global `model.config.id2label`.
    """
    plt.imshow(image)
    image_width, image_height = image.size
    for p, box in zip(probs, boxes):
        score, label = p.max(dim=-1)
        if score < CONFIDENCE_THRESHOLD:
            continue
        # "No object" is always the last class, so its index is derived from the
        # probability vector instead of hard-coding the class count of one checkpoint.
        no_object_label = p.shape[-1] - 1
        if label == no_object_label:
            continue
        print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
        )
        l, u, r, b = box.tolist()
        # Scale the relative coordinates to pixels.
        l, r = l * image_width, r * image_width
        u, b = u * image_height, b * image_height
        w = r - l
        h = b - u
        plt.gca().add_patch(patches.Rectangle(
            (l, u), w, h,
            linewidth=1, edgecolor='r', facecolor='none'
        ))
# Draw the raw (pre-NMS) predictions for the first image in the batch.
draw_boxes(image, boxes[0], probs[0])

We see many overlapping boxes for the ship class. We will apply NMS to solve the problem.

# Assignment 1. Implement IoU computation

Intersection-over-union (IoU) is computed as

$\mathrm{IoU}(box_1, box_2) = \frac{\mathrm{IntersectionArea}(box_1, box_2)}{\mathrm{UnionArea}(box_1, box_2)}$

The union area can be computed as the sum of the two boxes' areas minus their intersection area.

In [None]:
def batch_iou(reference, boxes):
    """Compute IoU for a batch of boxes.

    Input boxes are in the format [left, top, right, bottom].

    Input:
    1. reference: A reference box with shape (4).
    2. boxes: Alternative boxes with shape (N, 4).

    Returns:
    An array with length (N) with IoUs between reference and boxes.
    Pairs whose union area is not positive (degenerate boxes) get IoU 0
    instead of NaN.
    """
    result = torch.empty(len(boxes))

    # Your code starts here.
    if len(boxes) == 0:
        # Nothing to compare against (e.g. the last box inside an NMS loop).
        return result

    reference = torch.as_tensor(reference)
    boxes = torch.as_tensor(boxes)
    left, top, right, bottom = boxes.unbind(dim=-1)

    # Intersection rectangle; a negative extent means "no overlap" and is clamped to 0.
    inter_w = (torch.minimum(right, reference[2]) - torch.maximum(left, reference[0])).clamp(min=0)
    inter_h = (torch.minimum(bottom, reference[3]) - torch.maximum(top, reference[1])).clamp(min=0)
    inter_area = inter_w * inter_h

    reference_area = (reference[2] - reference[0]) * (reference[3] - reference[1])
    boxes_area = (right - left) * (bottom - top)
    union_area = reference_area + boxes_area - inter_area

    iou = inter_area / union_area
    # 0 / 0 would give NaN, which silently poisons every later comparison.
    iou = torch.where(union_area > 0, iou, torch.zeros_like(iou))
    # Copy into the pre-allocated buffer so the output keeps its dtype and device.
    result.copy_(iou)
    # The end of your code.

    return result

# Sanity-check the implementation against the hand-computed reference IoUs.
assert check_batch_iou(batch_iou)

# Assignment 2. Implement Non-Maximum Suppression

NMS algorithm:
1. Order boxes by object probability in descending order.
2. Start from the beginning. For each box that has not been removed:
    1. Compute IoU with the remaining (lower-ranked) boxes.
    2. Remove the boxes with IoU greater than the threshold.
3. Return the boxes that were kept.

**Note** that the last probability corresponds to the "no object" class.

In [None]:
@torch.no_grad()
def non_maximum_suppression(boxes, probs, iou_threshold=0.5):
    """Apply Non-maximum Suppression.

    Boxes are visited in descending order of object probability
    (1 - "no object" probability). Every box that is still kept removes all
    lower-ranked boxes whose IoU with it is greater than `iou_threshold`.
    The suppression is class-agnostic: class labels are not compared.

    Inputs:
        boxes: Tensor with shape (N, 4) in the (left, top, right, bottom) format.
        probs: Class probabilities with shape (N, C + 1), with the last class reserved for "no object".
        iou_threshold: Boxes overlapping a kept box with IoU above this value are removed.

    Returns:
        nms_boxes: Result boxes with shape (K, 4), ordered by descending object probability.
        nms_probs: Corresponding probabilities with shape (K, C + 1).
    """

    # Your code starts here.
    obj_probs = 1 - probs[:, -1]
    order = torch.argsort(obj_probs, descending=True)
    boxes = boxes[order]
    probs = probs[order]
    keep = torch.ones(len(boxes), dtype=torch.bool)
    for i, box in enumerate(boxes):
        # A box that was already removed must not remove others: otherwise a
        # chain A-B-C with A~B and B~C (but not A~C) would wrongly drop C.
        if not keep[i]:
            continue
        ious = torch.as_tensor(batch_iou(box, boxes[i + 1:]))
        # Keep boxes with IoU <= threshold, i.e. remove only IoU > threshold.
        keep[i + 1:] &= ious <= iou_threshold
    nms_boxes, nms_probs = boxes[keep], probs[keep]
    # The end of your code.

    return nms_boxes, nms_probs

# Pass the notebook-level IOU_THRESHOLD explicitly so that the constant
# actually controls NMS instead of the function's built-in default.
nms_boxes, nms_probs = non_maximum_suppression(
    boxes[0], probs[0], iou_threshold=IOU_THRESHOLD
)
print("Boxes after NMS:", len(nms_boxes))
draw_boxes(image, nms_boxes, nms_probs)

NMS must keep a single ship detection and a single cat detection.