In [17]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [18]:
# This is from the open source repo: https://github.com/sugey/pytorch-yolov3

# Let's talk docs!!
def predict_transform(prediction, inp_dim, anchors, num_classes, train=False):
    """
    Describe how a detection-layer feature map is reshaped into one row per anchor box.

    In this notebook the function body is only this docstring (calling it
    returns None); the steps it stands for are executed one cell at a time
    below so that each intermediate shape can be inspected.

    Each row of the intended result holds the attributes of a single anchor
    box in a single grid cell: an objectness score, the box coordinates
    (Bx, By, Bw, Bh) and one score per class.

    Params:
        prediction: feature map from a detection layer,
            batch x channels x grid x grid, where
            channels = number of anchors * (5 + num_classes).
            example: 416 x 416 input, first YOLO detection prediction:
            torch.Size([1, 255, 13, 13])
        inp_dim: side length of the (square) network input image, e.g. 416
        anchors: list of anchor size pairs expressed at full-image scale
        num_classes: number of classes for prediction. VOC is usually 20, COCO=80
        train: when True, the reshaped tensor is what gets returned; the
            additional inference-only steps are skipped (they are not
            covered in this notebook)

    Returns:
        the reshaped prediction, bs x (anchors * grid * grid) x (5 + num_classes);
        for a 13 x 13 grid with 3 anchors and 80 classes that is
        torch.Size([1, 507, 85])
    """

# In the actual code, we use this as a function, but we're going to break it up for explanation

In [19]:
# Everything in this walkthrough stays on the CPU. In real use most of these
# tensors would be moved to the GPU with .cuda() or .to(device).

# Stand-ins for the arguments predict_transform would receive.
inp_dim = 416
num_classes = 80
anchors = [(116, 90), (156, 198), (373, 326)]

# An all-ones int32 tensor shaped like the first YOLO detection output for a
# 416 x 416 image; only its shape matters for the reshaping shown below.
tensor = torch.tensor((), dtype=torch.int32)
prediction = torch.ones((1, 255, 13, 13), dtype=tensor.dtype)

print(f"Prediction input size: {prediction.size()}")

Prediction input size: torch.Size([1, 255, 13, 13])


In [20]:

# Number of images in the batch handed to this detection layer.
batch_size = prediction.shape[0]
# Stride: the full image size divided by the width of the feature map, i.e.
# how many input pixels one feature-map cell spans. For a 416 x 416 input the
# three detection layers produce maps 13, 26 and 52 cells wide.
stride = inp_dim // prediction.shape[2]
# Number of cells along one side of the grid (13, 26 or 52 for 416 x 416).
grid_size = inp_dim // stride
# Each box has 5 fixed slots -- 1 objectness score (the probability that an
# object is in this grid cell) plus the 4 attributes Bx, By, Bw, Bh (center
# coordinates, width, height) -- followed by one slot per class.
bbox_attrs = num_classes + 5
# How many anchor boxes are used at this detection layer.
num_anchors = len(anchors)

# The anchors are sized for the full image; rescale them to grid-cell units
# and store them back in a list.
# NOTE: this rebinds `anchors`, so running the cell twice scales them twice.
anchors = [(pair[0] / stride, pair[1] / stride) for pair in anchors]

print(f"batch_size: {batch_size}")
print(f"stride: {stride}")
print(f"grid_size: {grid_size}")
print(f"bbox_attrs: {bbox_attrs}")
print(f"num_anchors: {num_anchors}")
print(f"anchors: {anchors}")

batch_size: 1
stride: 32
grid_size: 13
bbox_attrs: 85
num_anchors: 3
anchors: [(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]


In [21]:
# First step towards one row per bounding box: merge the two spatial axes into
# one, so 1 x 255 x 13 x 13 becomes 1 x 255 x 169, i.e.
# batch x (anchors * box attributes) x grid cells. The result is still 3D.
prediction = prediction.view(
    batch_size, num_anchors * bbox_attrs, grid_size ** 2)

print(f"Prediction Tensor size: {prediction.size()}")

Prediction Tensor size: torch.Size([1, 255, 169])


In [22]:
# Swap the last two axes, 1 x 255 x 169 -> 1 x 169 x 255, so each grid cell
# gets its own row. .contiguous() then lays the data out in that new order in
# memory, which the next .view() call relies on.
prediction = prediction.permute(0, 2, 1).contiguous()

print(f"Prediction Tensor size: {prediction.size()}")

Prediction Tensor size: torch.Size([1, 169, 255])


In [23]:
# Split every grid cell's 255 values into its 3 anchors of 85 attributes each.
# For the 416 image this gives a 3D tensor of 1 x 507 x 85: one row for every
# anchor in every grid region. With a threshold of 0, all 507 of these boxes
# would be drawn on the image.
prediction = prediction.view(
    batch_size, num_anchors * grid_size ** 2, bbox_attrs)

print(f"Prediction Tensor size: {prediction.size()}")

Prediction Tensor size: torch.Size([1, 507, 85])


In [24]:
# Inside predict_transform the training path normally stops here: the function
# does `if train: return prediction` and hands back the reshaped tensor.
# Inference needs a few more steps, which are not covered today because they
# have little to do with re-shaping.
#
# A bare `return` is a SyntaxError at the top level of a notebook cell, and
# `train` only exists as a function parameter, so this walkthrough sets the
# flag explicitly and keeps a reference to the tensor instead of returning it.
train = True
if train:
    train_output = prediction

In [26]:
# To drop the leading batch axis, turning torch.Size([1, 507, 85]) into
# torch.Size([507, 85]), squeeze it away. The argument is the index of the
# size-1 dimension to remove.
prediction_ = torch.squeeze(prediction, dim=0)
print(f"Prediction Tensor size: {prediction_.size()}")

Prediction Tensor size: torch.Size([507, 85])


In [27]:
# To go the other way, from torch.Size([507, 85]) back to
# torch.Size([1, 507, 85]), unsqueeze. The argument is the index at which the
# new size-1 dimension is inserted.
prediction_1 = torch.unsqueeze(prediction_, dim=0)
print(f"Prediction Tensor size: {prediction_1.size()}")

Prediction Tensor size: torch.Size([1, 507, 85])
