### Here is the implementation of the YOLOv3 loss function. It is different from v1; see https://towardsdatascience.com/yolo-v3-object-detection-53fb7d3bfe6b

#### Before this, let's recap what we have done. The Dataloader reads image data and annotations. Darknet53 is the net that extracts features. The Yolo_layers do the detections.
#### The yolo layers return outputs at 3 scales: (13\*13 + 26\*26 + 52\*52)\*3\*(80+4+1). Here we use the outputs and the annotations to calculate the loss.

In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
def build_targets(model, targets, pred, *, reject=True, iou_thres=0.01):
    """Convert ground-truth labels into per-YOLO-layer training targets.

    For every YOLO layer of ``model`` each label is matched to the anchor
    whose width/height gives the highest IoU with the label's box, and the
    regression, class and objectness targets are built for that match.

    Args:
        model: The detection network (optionally wrapped in
            ``nn.DataParallel``). Its YOLO layers must expose ``nG`` (grid
            size) and ``anchor_vec`` (anchors in grid units).
        targets: Tensor with one row per labelled object in the batch,
            laid out as ``[image, class, x, y, w, h]``. The coordinates are
            multiplied by the grid size, so they are presumably normalised
            to [0, 1] -- confirm against the dataloader.
        pred: List with one prediction tensor per YOLO layer, e.g. shapes
            ``[bs, 3, 13, 13, 85]``, ``[bs, 3, 26, 26, 85]``,
            ``[bs, 3, 52, 52, 85]``. Only used to size the objectness target.
        reject: If true, drop labels whose best anchor IoU is not above
            ``iou_thres`` for the layer being processed.
        iou_thres: IoU threshold used when ``reject`` is true.

    Returns:
        Tuple ``(txy, twh, tcls, tconf, indices)``; each is a list with one
        entry per YOLO layer:

        * ``txy``: ``[n, 2]`` centre offsets inside the grid cell.
        * ``twh``: ``[n, 2]`` ``log(box_wh / anchor_wh)``.
        * ``tcls``: ``[n]`` class labels (long).
        * ``tconf``: objectness target shaped like ``pred[i][..., 0]``
          (e.g. ``[bs, 3, 13, 13]``), 1 at matched cells and 0 elsewhere.
        * ``indices``: tuple ``(b, a, gj, gi)`` = image index, anchor index,
          grid row (y) and grid column (x) of every kept label.
    """
    if isinstance(model, nn.DataParallel):
        model = model.module
    yolo_layers = get_yolo_layers(model)  # e.g. [82, 94, 106] for YOLOv3

    txy, twh, tcls, tconf, indices = [], [], [], [], []
    for i, layer in enumerate(yolo_layers):
        yolo = model.module_list[layer][0]
        nG = yolo.nG  # grid size, e.g. 13, 26 or 52
        anchor_vec = yolo.anchor_vec  # anchors for this layer, e.g. shape [3, 2]

        # The labels hold the box of every object in the batch. Scale the
        # width/height to this layer's grid, compute the IoU against each
        # anchor and keep the best IoU and its anchor index per label.
        gwh = targets[:, 4:6] * nG
        iou = [wh_iou(x, gwh) for x in anchor_vec]
        iou, a = torch.stack(iou, 0).max(0)  # best iou and anchor

        # Optionally reject labels that match no anchor well enough.
        if reject:
            j = iou > iou_thres
            t, a, gwh = targets[j], a[j], gwh[j]
        else:
            t = targets

        # Indices
        b, c = t[:, 0:2].long().t()  # b: image index, c: class label
        gxy = t[:, 2:4] * nG
        # Clamp so that a centre lying exactly on the right/bottom edge
        # (x or y == 1.0) falls into the last cell instead of indexing past
        # the grid. int() assumes nG is a scalar (int or 1-element tensor).
        max_idx = int(nG) - 1
        gi, gj = gxy.long().clamp(0, max_idx).t()  # grid_i (x), grid_j (y)
        indices.append((b, a, gj, gi))

        # XY coordinates: offset of the centre within its grid cell
        txy.append(gxy - gxy.floor())

        # Width and height (YOLO log-space encoding relative to the anchor)
        twh.append(torch.log(gwh / anchor_vec[a]))

        # Class
        tcls.append(c)

        # Conf: 1 where an object is assigned, 0 everywhere else
        tci = torch.zeros_like(pred[i][..., 0])
        tci[b, a, gj, gi] = 1
        tconf.append(tci)

    return txy, twh, tcls, tconf, indices

In [24]:
def compute_loss(p, targets):
    """Compute the YOLOv3 training loss from predictions and built targets.

    Args:
        p: List with one prediction tensor per YOLO layer, e.g. shapes
            ``[bs, 3, 13, 13, 85]``, ``[bs, 3, 26, 26, 85]``,
            ``[bs, 3, 52, 52, 85]``. The last axis is read as
            ``[x, y, w, h, conf, class logits...]``.
        targets: Tuple ``(txy, twh, tcls, tconf, indices)`` with one entry
            per layer; ``indices[i]`` is ``(b, a, gj, gi)``.

    Returns:
        Tuple ``(loss, d)``: ``loss`` is a 1-element tensor (sum of the xy,
        wh, conf and cls terms) and ``d`` is a ``defaultdict(float)`` with
        the Python-float values under the keys ``'total'``, ``'xy'``,
        ``'wh'``, ``'conf'`` and ``'cls'``.
    """
    # Local import: the file's import cell does not bring in defaultdict.
    from collections import defaultdict

    # Allocate the accumulators on the same device as the predictions
    # (the legacy torch.cuda.FloatTensor always uses the *current* device).
    device = p[0].device
    lxy, lwh, lcls, lconf = (
        torch.zeros(1, dtype=torch.float32, device=device) for _ in range(4)
    )
    txy, twh, tcls, tconf, indices = targets
    MSE = nn.MSELoss()
    CE = nn.CrossEntropyLoss()
    BCE = nn.BCEWithLogitsLoss()

    k = 1  # global scale applied to every term
    for i, pi0 in enumerate(p):  # predictions of layer i
        b, a, gj, gi = indices[i]  # image, anchor, grid y, grid x

        # Box and class terms exist only where an object was assigned.
        if len(b) > 0:
            pi = pi0[b, a, gj, gi]  # predictions at the assigned cells
            lxy += k * MSE(torch.sigmoid(pi[..., 0:2]), txy[i])  # xy
            lwh += k * MSE(pi[..., 2:4], twh[i])  # wh
            lcls += (k / 4) * CE(pi[..., 5:], tcls[i])  # class

        # Objectness is evaluated over every cell of the layer.
        lconf += (k * 64) * BCE(pi0[..., 4], tconf[i])
    loss = lxy + lwh + lconf + lcls

    # Collect the scalar values for logging.
    d = defaultdict(float)
    names = ['total', 'xy', 'wh', 'conf', 'cls']
    values = [loss.item(), lxy.item(), lwh.item(), lconf.item(), lcls.item()]
    d.update(zip(names, values))

    return loss, d