**IMPORTS**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import cv2

#for loading weights
import struct
from collections import defaultdict

#utils functions
import random 
import colorsys
import scipy
import imghdr
from PIL import Image, ImageDraw, ImageFont

import torch
import torch.nn as nn #building block to create and train NN
import torch.nn.functional as F #build network layers
import torch.optim as optim #optimized gradient descent

import torchvision.transforms as transforms #transforms images to tensors
import torchvision.models as models #pre-trained models
import torchvision.ops as ops #operators specific for computer vision

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**YOLO FILTER BOXES (BASED ON PROBABILITY THRESHOLD)**

In [None]:
def yolo_filter_boxes(box_confidence, boxes, box_class_probs,  threshold = 0.6):
    
    box_scores = box_confidence * box_class_probs

    box_classes = torch.argmax(box_scores,axis=-1)
    box_class_scores = torch.max(box_scores,axis=-1,keepdim=False)[0].squeeze()
    
    filtering_mask = (box_class_scores >= threshold)

    scores = torch.masked_select(box_class_scores, filtering_mask)
    boxes = torch.masked_select(boxes, filtering_mask.unsqueeze(dim=-1)).reshape(-1,4)
    classes = torch.masked_select(box_classes, filtering_mask)

    return scores, boxes, classes

In [None]:
torch.manual_seed(1)

box_confidence = torch.empty(19,19,5,1).normal_(mean=1,std=4)
boxes = torch.empty(19,19,5,4).normal_(mean=1,std=4)
box_class_probs = torch.empty(19,19,5,80).normal_(mean=1,std=4)

scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold = 0.5)

print("scores[2] = " + str(scores[2]))
print("boxes[2] = " + str(boxes[2]))
print("classes[2] = " + str(classes[2]))
print("scores.shape = " + str(scores.shape))
print("boxes.shape = " + str(boxes.shape))
print("classes.shape = " + str(classes.shape))

scores[2] = tensor(22.5610)
boxes[2] = tensor([ 5.3702, -1.3419,  7.4093,  4.6440])
classes[2] = tensor(71)
scores.shape = torch.Size([1794])
boxes.shape = torch.Size([1794, 4])
classes.shape = torch.Size([1794])


**INTERSECTION OVER UNION**

In [None]:
def iou(box1,box2):
    (box1_x1, box1_y1, box1_x2, box1_y2) = box1
    (box2_x1, box2_y1, box2_x2, box2_y2) = box2

    xi1 = max(box1_x1,box2_x1)
    yi1 = max(box1_y1,box2_y1)
    xi2 = min(box1_x2,box2_x2)
    yi2 = min(box1_y2,box2_y2)

    intersection_width = xi2 - xi1
    intersection_height = yi2 - yi1
    intersection_area = max(intersection_width,0) * max(intersection_height,0)

    box1_area = (box1_x2 - box1_x1) * (box1_y2 - box1_y1)
    box2_area = (box2_x2 - box2_x1) * (box2_y2 - box2_y1)
    union_area = box1_area + box2_area - intersection_area

    iou = intersection_area / union_area
    return iou

In [None]:
## Test case 1: boxes intersect
box1 = (2, 1, 4, 3)
box2 = (1, 2, 3, 4) 
print("iou for intersecting boxes = " + str(iou(box1, box2)))

## Test case 2: boxes do not intersect
box1 = (1,2,3,4)
box2 = (5,6,7,8)
print("iou for non-intersecting boxes = " + str(iou(box1,box2)))

## Test case 3: boxes intersect at vertices only
box1 = (1,1,2,2)
box2 = (2,2,3,3)
print("iou for boxes that only touch at vertices = " + str(iou(box1,box2)))

## Test case 4: boxes intersect at edge only
box1 = (1,1,3,3)
box2 = (2,3,3,4)
print("iou for boxes that only touch at edges = " + str(iou(box1,box2)))

iou for intersecting boxes = 0.14285714285714285
iou for non-intersecting boxes = 0.0
iou for boxes that only touch at vertices = 0.0
iou for boxes that only touch at edges = 0.0


**CONVERT [MIDPOINT, HEIGHT, WIDTH] TO [TOP LEFT, BOTTOM RIGHT]**

In [None]:
def convert_coordinates(x):
    #bx, by, bw, bh >>>> xmin, ymin, xmax, ymax
    y = x.new(x.shape)
    y[..., 0] = x[..., 0] - x[..., 2] / 2
    y[..., 1] = x[..., 1] - x[..., 3] / 2
    y[..., 2] = x[..., 0] + x[..., 2] / 2
    y[..., 3] = x[..., 1] + x[..., 3] / 2
    
    return y

In [None]:
convert_coordinates(torch.tensor([0,0,3,5],dtype=torch.float32))

tensor([-1.5000, -2.5000,  1.5000,  2.5000])

**NON MAX SUPPRESSION**

In [None]:
def yolo_non_max_suppression(boxes, classes, scores, max_boxes = 10, iou_threshold = 0.2):

    nms_indices = ops.nms(boxes, scores, iou_threshold)

    scores = torch.gather(scores, 0, nms_indices)
    classes = torch.gather(classes, 0, nms_indices)
    boxes = boxes[nms_indices[:]]

    return scores, boxes, classes

In [None]:
scores, boxes, classes = yolo_non_max_suppression(boxes, classes, scores)

print(scores.shape)
print(boxes.shape)
print(classes.shape)

print("scores[2] = " + str(scores[2]))
print("boxes[2] = " + str(boxes[2]))
print("classes[2] = " + str(classes[2]))
print("scores.shape = " + str(scores.shape))
print("boxes.shape = " + str(boxes.shape))
print("classes.shape = " + str(classes.shape))

torch.Size([1470])
torch.Size([1470, 4])
torch.Size([1470])
scores[2] = tensor(150.3171)
boxes[2] = tensor([11.5748, -0.2527,  4.5467,  0.7380])
classes[2] = tensor(10)
scores.shape = torch.Size([1470])
boxes.shape = torch.Size([1470, 4])
classes.shape = torch.Size([1470])


**WRAPPING UP THE FILTERING**

In [None]:
def yolo_eval(yolo_outputs, image_shape = (720., 1280.), max_boxes=10, score_threshold=.6, iou_threshold=.5):
    
    #Retrieve outputs of the yolo model
    box_confidence, box_xywh, box_class_probs = yolo_outputs
    #Convert bx,by,bh,bw to x1,y1,x2,y2
    boxes = convert_coordinates(box_xywh)
    #Filter using score_threshold
    scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, score_threshold)
    #Rescale boxes to original image shape
    #boxes = scale_boxes(boxes, image_shape)
    #Perform non max suppression given iou threshold
    scores, boxes, classes = yolo_non_max_suppression(boxes, classes, scores, iou_threshold)
    
    return scores, boxes, classes

In [None]:
yolo_outputs = (torch.rand(1,19,19,5,1),
                torch.rand(1,19,19,5,4),
                torch.rand(1,19,19,5,80))
                    
scores, boxes, classes = yolo_eval(yolo_outputs)

print("scores[2] = " + str(scores[2]))
print("boxes[2] = " + str(boxes[2]))
print("classes[2] = " + str(classes[2]))
print("scores.shape = " + str(scores.shape))
print("boxes.shape = " + str(boxes.shape))
print("classes.shape = " + str(classes.shape))

scores[2] = tensor(0.9936)
boxes[2] = tensor([-0.1124,  0.4062,  0.6784,  0.6014])
classes[2] = tensor(1)
scores.shape = torch.Size([113])
boxes.shape = torch.Size([113, 4])
classes.shape = torch.Size([113])


**Summary for YOLO**:
- Input image (608, 608, 3)
- The input image goes through a CNN, resulting in a (19,19,5,85) dimensional output. 
- After flattening the last two dimensions, the output is a volume of shape (19, 19, 425):
    - Each cell in a 19x19 grid over the input image gives 425 numbers. 
    - 425 = 5 x 85 because each cell contains predictions for 5 boxes, corresponding to 5 anchor boxes, as seen in lecture. 
    - 85 = 5 + 80 where 5 is because $(p_c, b_x, b_y, b_h, b_w)$ has 5 numbers, and 80 is the number of classes we'd like to detect
- You then select only few boxes based on:
    - Score-thresholding: throw away boxes that have detected a class with a score less than the threshold
    - Non-max suppression: Compute the Intersection over Union and avoid selecting overlapping boxes
- This gives you YOLO's final output. 

**DEFINING ANCHORS**

In [None]:
def read_anchors(anchors_path):
    with open(anchors_path) as f:
        anchors = f.readline()
        anchors = [float(x) for x in anchors.split(',')]
        anchors = np.array(anchors).reshape(-1, 2)
    return anchors

In [None]:
anchors_path = '/content/gdrive/My Drive/yolo_anchors.txt'
anchors = read_anchors(anchors_path)
anchors

array([[0.57273 , 0.677385],
       [1.87446 , 2.06253 ],
       [3.33843 , 5.47434 ],
       [7.88282 , 3.52778 ],
       [9.77052 , 9.16828 ]])

**DEFINING CLASSES**

In [None]:
def read_classes(classes_path):
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names

In [None]:
classes_path = '/content/gdrive/My Drive/coco_classes.txt'
class_names = read_classes(classes_path)
len(class_names)

80

**BUILDING THE MODEL**

In [None]:
def space_to_depth(x, block_size):
    n, c, h, w = x.size()
    unfolded_x = torch.nn.functional.unfold(x, block_size, stride=block_size)
    return unfolded_x.view(n, c * block_size ** 2, h // block_size, w // block_size)

In [None]:
class Yolov2(nn.Module):
    
    def __init__(self):
        super(Yolov2, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm1 = nn.BatchNorm2d(32)
        
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(64)
        
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm4 = nn.BatchNorm2d(64)
        self.conv5 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm5 = nn.BatchNorm2d(128)
        
        self.conv6 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm6 = nn.BatchNorm2d(256)
        self.conv7 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm7 = nn.BatchNorm2d(128)
        self.conv8 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm8 = nn.BatchNorm2d(256)
        
        self.conv9 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm9 = nn.BatchNorm2d(512)
        self.conv10 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm10 = nn.BatchNorm2d(256)
        self.conv11 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm11 = nn.BatchNorm2d(512)
        self.conv12 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm12 = nn.BatchNorm2d(256)
        self.conv13 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm13 = nn.BatchNorm2d(512)
        
        self.conv14 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm14 = nn.BatchNorm2d(1024)
        self.conv15 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm15 = nn.BatchNorm2d(512)
        self.conv16 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm16 = nn.BatchNorm2d(1024)
        self.conv17 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm17 = nn.BatchNorm2d(512)
        self.conv18 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm18 = nn.BatchNorm2d(1024)

        self.conv19 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm19 = nn.BatchNorm2d(1024)
        self.conv20 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm20 = nn.BatchNorm2d(1024)

        self.conv21 = nn.Conv2d(in_channels=512, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm21 = nn.BatchNorm2d(64)

        self.conv22 = nn.Conv2d(in_channels=1280, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm22 = nn.BatchNorm2d(1024)
        
        self.conv23 = nn.Conv2d(in_channels=1024, out_channels=425, kernel_size=1, stride=1, padding=0, bias = True)
        

    def forward(self, x):
        out = F.max_pool2d(F.leaky_relu(self.batchnorm1(self.conv1(x)), negative_slope=0.1), 2, stride=2)
        out = F.max_pool2d(F.leaky_relu(self.batchnorm2(self.conv2(out)), negative_slope=0.1), 2, stride=2)
        
        out = F.leaky_relu(self.batchnorm3(self.conv3(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm4(self.conv4(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm5(self.conv5(out)), negative_slope=0.1)
        out = F.max_pool2d(out, 2, stride=2)
        
        out = F.leaky_relu(self.batchnorm6(self.conv6(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm7(self.conv7(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm8(self.conv8(out)), negative_slope=0.1)
        out = F.max_pool2d(out, 2, stride=2)

        out = F.leaky_relu(self.batchnorm9(self.conv9(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm10(self.conv10(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm11(self.conv11(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm12(self.conv12(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm13(self.conv13(out)), negative_slope=0.1)
        
        passthrough = F.leaky_relu(self.batchnorm21(self.conv21(out)), negative_slope=0.1)
        passthrough = space_to_depth(passthrough,2)
        
        out = F.max_pool2d(out, 2, stride=2)
        out = F.leaky_relu(self.batchnorm14(self.conv14(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm15(self.conv15(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm16(self.conv16(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm17(self.conv17(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm18(self.conv18(out)), negative_slope=0.1)

        out = F.leaky_relu(self.batchnorm19(self.conv19(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm20(self.conv20(out)), negative_slope=0.1)
        
        out = torch.cat([passthrough, out], 1)
        out = F.leaky_relu(self.batchnorm22(self.conv22(out)), negative_slope=0.1)
        out = self.conv23(out)

        return out

In [None]:
model = Yolov2()

In [None]:
#model.conv23.weight == torch.tensor(weights[-435200:-1])

In [None]:
model.conv22.weight[31,2,2,:]

tensor([-0.0062, -0.0063,  0.0035], grad_fn=<SliceBackward>)

**LOADING WEIGHTS**

In [None]:
weightfile = '/content/gdrive/My Drive/yolov2.weights'
fp = open(weightfile, "rb")

#The first 5 values are header information 
# 1. Major version number
# 2. Minor Version Number
# 3. Subversion number 
# 4. Images seen by the network (during training)
header = np.fromfile(fp, dtype = np.int32, count = 4)
weights = np.fromfile(fp, dtype = np.float32)

In [None]:
num_weights = int(len(weights))
params = sum([p.numel() for p in model.parameters()])
print('Total params: ',num_weights)
print('Trainable params: ',params)
print('Non trainable params: ',num_weights-params)

Total params:  50983561
Trainable params:  50962889
Non trainable params:  20672


In [None]:
group_mapping = defaultdict(lambda: defaultdict())
cnt = 0
for child in model.children():
    if type(child) == nn.Conv2d:
        cnt += 1
        if cnt == 23:
            group_mapping['conv'+str(cnt)] = child
        if cnt > 22:
            break
        group_mapping['conv'+str(cnt)] = child
        #group_mapping['bias'+str(cnt)] = child
    else:
        group_mapping['bias'+str(cnt)] = child

for i,j in group_mapping.items():
    print(i,j)

conv1 Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
bias1 BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
conv2 Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
bias2 BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
conv3 Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
bias3 BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
conv4 Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
bias4 BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
conv5 Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
bias5 BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
conv6 Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
bias6 BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_s

In [None]:
f = open(weightfile, 'rb')
major, minor, revision, seen = struct.unpack('4i', f.read(16))

for i in range(1, 24):
    if i == 23:
            conv_var = group_mapping['conv'+str(i)]
            c_out, c_in, f1, f2 = conv_var.weight.size()
            cnt = int(c_out * c_in * f1 * f2)
            p = struct.unpack('%df' % cnt, f.read(4*cnt))
            conv_var.weight.data = torch.from_numpy(np.reshape(p, [c_out, c_in, f1, f2])).float()
            for param in conv_var.parameters():
                param.requires_grad = False
            break

    bias_var = group_mapping['bias'+str(i)]
    cnt = int(bias_var.bias.size()[0])
    bias_var.bias.data = torch.from_numpy(np.array(struct.unpack('%df' % cnt, f.read(4*cnt)))).float()
    bias_var.weight.data = torch.from_numpy(np.array(struct.unpack('%df' % cnt, f.read(4*cnt)))).float()
    bias_var.running_mean = torch.from_numpy(np.array(struct.unpack('%df' % cnt, f.read(4*cnt)))).float()
    bias_var.running_var = torch.from_numpy(np.array(struct.unpack('%df' % cnt, f.read(4*cnt)))).float()
    for param in bias_var.parameters():
        param.requires_grad = False
    
    conv_var = group_mapping['conv'+str(i)]
    c_out, c_in, f1, f2 = conv_var.weight.size()
    cnt = int(c_out * c_in * f1 * f2)
    p = struct.unpack('%df' % cnt, f.read(4*cnt))
    conv_var.weight.data = torch.from_numpy(np.reshape(p, [c_out, c_in, f1, f2])).float()
    for param in conv_var.parameters():
        param.requires_grad = False


In [None]:
model.batchnorm2.bias == torch.tensor(weights[128+864:128+864+64])

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True])

**YOLO HEAD**

In [None]:
def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.
    Parameters
    ----------
    feats : tensor
        Final convolutional layer features.
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.
    Returns
    -------
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_conf : tensor
        Probability estimate for whether each box contains any object.
    box_class_pred : tensor
        Probability distribution estimate for each box over class labels.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = torch.reshape(torch.tensor(anchors), (1, 1, 1, num_anchors, 2))

    # Static implementation for fixed models.
    # TODO: Remove or add option for static implementation.
    # _, conv_height, conv_width, _ = K.int_shape(feats)
    # conv_dims = K.variable([conv_width, conv_height])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = feats.shape[1:3]  # assuming channels last
    # In YOLO the height index is the inner most iteration.
    conv_height_index = torch.arange(0, end=conv_dims[0])
    conv_width_index = torch.arange(0, end=conv_dims[1])
    conv_height_index = conv_height_index.repeat(conv_dims[1])

    # TODO: Repeat_elements and tf.split doesn't support dynamic splits.
    # conv_width_index = K.repeat_elements(conv_width_index, conv_dims[1], axis=0)
    conv_width_index = (torch.unsqueeze(conv_width_index, 0)).repeat(conv_dims[0], 1)

    conv_width_index = torch.flatten(torch.transpose(conv_width_index,0,1))
    conv_index = torch.transpose(torch.stack([conv_height_index, conv_width_index]),0,1)
    conv_index = torch.reshape(conv_index, (1, conv_dims[0], conv_dims[1], 1, 2))
    conv_index.type(feats.dtype)

    feats = torch.reshape(
        feats, [-1, conv_dims[0], conv_dims[1], num_anchors, 80 + 5])
    conv_dims = torch.reshape(torch.tensor(conv_dims), (1, 1, 1, 1, 2))
    conv_dims.type(feats.dtype)

    # Static generation of conv_index:
    # conv_index = np.array([_ for _ in np.ndindex(conv_width, conv_height)])
    # conv_index = conv_index[:, [1, 0]]  # swap columns for YOLO ordering.
    # conv_index = K.variable(
    #     conv_index.reshape(1, conv_height, conv_width, 1, 2))
    # feats = Reshape(
    #     (conv_dims[0], conv_dims[1], num_anchors, num_classes + 5))(feats)

    box_xy = torch.sigmoid(feats[..., :2])
    box_wh = torch.exp(feats[..., 2:4])
    box_confidence = torch.sigmoid(feats[..., 4:5])
    box_class_probs = torch.softmax(feats[..., 5:], 0, dtype=feats.dtype)

    # Adjust preditions to each spatial grid point and anchor size.
    # Note: YOLO iterates over height index before width index.
    box_xy = ((box_xy + conv_index) / conv_dims).type(feats.dtype)
    box_wh = (box_wh * anchors_tensor / conv_dims).type(feats.dtype)

    box = torch.cat((box_xy,box_wh),-1)

    return box, box_confidence, box_class_probs

In [None]:
t = torch.rand(1,19,19,425)
box, box_confidence, box_class_probs = yolo_head(t, anchors, len(classes))

print(box_confidence.shape)
print(box.shape)
print(box_class_probs.shape)

torch.Size([1, 19, 19, 5, 1])
torch.Size([1, 19, 19, 5, 4])
torch.Size([1, 19, 19, 5, 80])


**LOADING IMAGE**

In [None]:
transform = transforms.Compose([
    transforms.Resize((608,608)),
    transforms.ToTensor()])

def image_loader(img_dir): # PIL image to 4d tensor (B,C,H,W)
    image = Image.open(img_dir)
    image = transform(image).unsqueeze(0)
    return image

img_dir = '/content/gdrive/My Drive/800.jpeg'
model_inputs = image_loader(img_dir)

In [None]:
#yolo model input image
model_inputs.shape

torch.Size([1, 3, 608, 608])

**INPUT CYCLE SUMMARY**

1. <font color='purple'> yolo_model.input </font> is given to `yolo_model`. The model is used to compute the output <font color='purple'> yolo_model.output </font>
2. <font color='purple'> yolo_model.output </font> is processed by `yolo_head`. It gives you <font color='purple'> yolo_outputs </font>
3. <font color='purple'> yolo_outputs </font> goes through a filtering function, `yolo_eval`. It outputs your predictions: <font color='purple'> scores, boxes, classes </font>

**YOLO BODY**

In [None]:
def yolo_body(model, img_dir):
    
    #retrieve image from img_dir and convert to tensor
    model_inputs = image_loader(img_dir)
    #forward pass through the model
    model_outputs = model.forward(model_inputs)
    #reshape the model_outputs from b,c,h,w to b,h,w,c
    model_outputs = model_outputs.permute(0,3,2,1)
    #break down the model_outputs to box_confidence (pc), boxes (bx,by,bw,bh), box_class_probs (c1,c2..,c80)
    boxes, box_confidence, box_class_probs = yolo_head(model_outputs, anchors, len(class_names))
    #combine the box_confidence, boxes, box_class_probs to tuple yolo_outputs
    yolo_outputs = (box_confidence, boxes, box_class_probs)
    #perform boxes selection using threshold and non max suppression
    yolo_scores, yolo_boxes, yolo_classes = yolo_eval(yolo_outputs)
    #combine yolo_scores, yolo_boxes, yolo_classes to tuple predictions
    predictions = (yolo_scores,yolo_boxes,yolo_classes)
    #pass the predictions to draw the new image
    out_scores, out_boxes, out_classes = predict(predictions, img_dir)
    
    return out_scores, out_boxes, out_classes

In [1]:
#yolo_body(model,img_dir)

**UTILS FUNCTIONS**

**GENERATE COLORS**

In [None]:
def generate_colors(class_names):
    hsv_tuples = [(x / len(class_names), 1., 1.) for x in range(len(class_names))]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors))
    random.seed(10101)  # Fixed seed for consistent colors across runs.
    random.shuffle(colors)  # Shuffle colors to decorrelate adjacent classes.
    random.seed(None)  # Reset seed to default.
    return colors


**PRE PROCESS IMAGE**

In [None]:
def preprocess_image(img_path, model_image_size):
    image_type = imghdr.what(img_path)
    image = Image.open(img_path)
    resized_image = image.resize(tuple(reversed(model_image_size)), Image.BICUBIC)
    image_data = np.array(resized_image, dtype='float32')
    image_data /= 255.
    image_data = np.expand_dims(image_data, 0)  # Add batch dimension.
    return image, image_data

**SCALE BOXES**

In [None]:
def scale_boxes(boxes, image_shape):
    """ Scales the predicted boxes in order to be drawable on the image"""
    height = image_shape[0]
    width = image_shape[1]
    image_dims = K.stack([height, width, height, width])
    image_dims = K.reshape(image_dims, [1, 4])
    boxes = boxes * image_dims
    return boxes

**DRAW BOXES**

In [None]:
def draw_boxes(image, out_scores, out_boxes, out_classes, class_names, colors):
    
    #font = ImageFont.truetype(font='font/FiraMono-Medium.otf',size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
    thickness = (image.size[0] + image.size[1]) // 300

    for i, c in reversed(list(enumerate(out_classes))):
        predicted_class = class_names[c]
        box = out_boxes[i]
        score = out_scores[i]

        label = '{} {:.2f}'.format(predicted_class, score)

        draw = ImageDraw.Draw(image)
        label_size = draw.textsize(label)

        top, left, bottom, right = box
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
        print(label, (left, top), (right, bottom))

        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])

        # My kingdom for a good redistributable image drawing library.
        for i in range(thickness):
            draw.rectangle([left + i, top + i, right - i, bottom - i], outline=colors[c])
        draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=colors[c])
        #draw.text(text_origin, label, fill=(0, 0, 0))
        del draw

**RUN THE GRAPH ON AN IMAGE**

In [None]:
def predict(predictions, image_file):
    """
    Arguments:
    image_file -- name of an image stored in the "images" folder.
    
    Returns:
    out_scores -- tensor of shape (None, ), scores of the predicted boxes
    out_boxes -- tensor of shape (None, 4), coordinates of the predicted boxes
    out_classes -- tensor of shape (None, ), class index of the predicted boxes
    
    Note: "None" actually represents the number of predicted boxes, it varies between 0 and max_boxes. 
    """

    # Preprocess your image
    image, image_data = preprocess_image(image_file, model_image_size = (608, 608))

    out_scores, out_boxes, out_classes = predictions
    out_scores = out_scores.detach().numpy()
    out_boxes = out_boxes.detach().numpy()
    out_classes = out_classes.detach().numpy()

    # Print predictions info
    print('Found {} boxes for input image'.format(len(out_boxes)))
    # Generate colors for drawing bounding boxes.
    colors = generate_colors(class_names)
    # Draw bounding boxes on the image file
    draw_boxes(image, out_scores, out_boxes, out_classes, class_names, colors)
    # Display the results in the notebook
    plt.imshow(image)
    
    return out_scores, out_boxes, out_classes