# Following this series:
https://www.youtube.com/watch?v=t-phGBfPEZ4&list=PLhhyoLH6Ijfw0TpCTVTNk42NN08H6UvNq

In [9]:
import torch.nn as nn
import torch
from torchinfo import summary
# from torchmetrics.detection import IntersectionOverUnion
from collections import *
# metrics from aladin peterson
# https://github.com/aladdinpersson/Machine-Learning-Collection
from utils.main import *
import os
import pandas as pd
import PIL as image
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2, ToTensor



In [2]:
cnn_config = [
  # kernel, filter, stride, padding
  (7,64,2,3),
  # max pool
  "M",
  (3,192,1,1),
  "M",
  (1,128,1,0),
  (3,256,1,0),
  (3,256,1,0),
  (3,512,1,1),
  "M",
  #CNN block with 4 identical layers
  [(1,256,1,0), (3,512,1,1), 4],
  (1,512,1,0),
  (3,1024,1,1),
  "M",
  [(1,512,1,0), (3,1024, 1,1), 2],
  (3,1024,1,1),
  (3,1024,2,1),
  (3,1024,1,1),
  (3,1024,1,1),
]

class CNNBlock(nn.Module):
  def __init__(self, out_channels, **kwargs):
    super(CNNBlock, self).__init__()
    #turn of bias to use BatchNorm
    self.conv = nn.LazyConv2d(out_channels, bias= False, **kwargs)
    self.batchnorm = nn.BatchNorm2d(num_features = out_channels)
    self.leaky_relu = nn.LeakyReLU(0.1)

  def forward(self, X):
    return self.leaky_relu(self.batchnorm(self.conv(X)))

class Yolov1(nn.Module):
  def __init__(self, **kwargs):
    super(Yolov1, self).__init__()
    self.cnn_config = cnn_config
    # self.in_channels = in_channels
    #pass in the cnn config to construct the model
    self.darknet = self._create_conv_layers(self.cnn_config)
    #fully connected layer creation
    self.fc = self._create_fc(**kwargs)

  def forward(self, X):
    X = self.darknet(X)
    return self.fc(torch.flatten(X, start_dim=1))

  def _create_conv_layers(self, config):
    layers = []
    # in_channels = self.in_channels

    for x in config:
      #conv layer
      if type(x) == tuple:
        kernel, out_channels, stride, padding = x
        layers.append(CNNBlock(out_channels,
                               kernel_size = kernel, stride = stride, padding = padding))
      elif type(x) == str:
        layers.append(nn.MaxPool2d(kernel_size = (2,2), stride = (2,2)))

      elif type(x) == list:
        conv1, conv2, repeats = x
        for _ in range(repeats):
          kernel1, out_channels1, stride1, padding1 = conv1
          layers.append(CNNBlock(out_channels1,
                                kernel_size = kernel1, stride = stride1, padding = padding1))

          kernel2, out_channels2, stride2, padding2 = conv2
          layers.append(CNNBlock(out_channels2,
                                kernel_size = kernel2, stride = stride2, padding = padding2))
          #according to the paper, for each repeated block, we take output channel of conv2
          #and feed it as in channel to the next block
          # in_channels = out_channels2
    #unpack a list [a,b,c,d,...] into a,b,c,d,...
    return nn.Sequential(*layers)


  def _create_fc(self, grid_size, num_boxes, num_classes):
    # pred vector should look like [c1,c2,...cN, p1, x1,y1,w1,h2, p2, x2,y2,w2,h2]
    S, B, C = grid_size, num_boxes, num_classes

    return nn.Sequential(
        nn.Flatten(),
        nn.LazyLinear(496), # original paper is 4096, reduce it to reduce training resources
        nn.Dropout(0.5),
        nn.LeakyReLU(0.1),
        #each cell has # of classes + # of boxes * 5 (5 because it's p,x,y,w,h)
        nn.Linear(496, S * S * (C + B * 5)), #Reshape to (S*S * 30)
    )



In [4]:
S = 7
B = 2
C = 20

model = Yolov1(grid_size = S, num_boxes = B, num_classes = C)
summary(model, input_size=(2,3,448,448))



Layer (type:depth-idx)                   Output Shape              Param #
Yolov1                                   [2, 1470]                 --
├─Sequential: 1-1                        [2, 1024, 7, 7]           --
│    └─CNNBlock: 2-1                     [2, 64, 224, 224]         --
│    │    └─Conv2d: 3-1                  [2, 64, 224, 224]         9,408
│    │    └─BatchNorm2d: 3-2             [2, 64, 224, 224]         128
│    │    └─LeakyReLU: 3-3               [2, 64, 224, 224]         --
│    └─MaxPool2d: 2-2                    [2, 64, 112, 112]         --
│    └─CNNBlock: 2-3                     [2, 192, 112, 112]        --
│    │    └─Conv2d: 3-4                  [2, 192, 112, 112]        110,592
│    │    └─BatchNorm2d: 3-5             [2, 192, 112, 112]        384
│    │    └─LeakyReLU: 3-6               [2, 192, 112, 112]        --
│    └─MaxPool2d: 2-4                    [2, 192, 56, 56]          --
│    └─CNNBlock: 2-5                     [2, 128, 56, 56]          --
│    

# Loss function
check formula in the paper

In [7]:
# The idea of this is each prediction gives 2 boxes
# There's only 1 box for target
# then get IoU of these 2 pred boxes on 1 target box

class YoloLoss(nn.Module):
  def __init__(self, S= 7,B = 2, C = 30):
    super(YoloLoss, self).__init__()

    self.mse = nn.MSELoss(reduction = 'sum')
    self.S = S
    self.B = B
    self.C = C
    self.lambda_noobj = 0.5
    self.lambda_coord = 5

  def forward(self, preds, target):
    #Input shape = 2 * 1470 ( from summary above)
    # Reshape to (-1, S, S, C + B * 5)
    #Preds = [c1,c2,...c20, p1, x1, y2, w1, h1, p2, x2, y2, w2, h2]

    preds = preds.reshape(-1, self.S, self.S, self.C + self.B * 5)
    box1 = preds[...]
    # 21:25 is the first box x1,y1,w1,h1
    iou_box1 = intersection_over_union(preds[..., 21:25], target[..., 21:25])
    # 26:30 is the 2nd box x2,y2,w2,h2
    iou_box2 = intersection_over_union(preds[..., 26:30], target[..., 21:25])

    #torch.squeeze: insert new dimension to that specify position
    ious = torch.cat([iou_box1.unsqueeze(0), iou_box2.unsqueeze(0)], dim = 0)
    iou_max, bestbox = torch.max(ious, dim = 0)
    #exists box, 1 or 0
    exists_box = target[..., 20].unsqueeze(3)
    #box coords

    box_preds = exists_box * (bestbox * (preds[..., 26:30]) + \
                              (1 - bestbox) * (preds[..., 21:25]))
    box_targets = exists_box * target[..., 21 : 25]

    #the 1e-6 is for numerical stability
    # torch.sign([-1,-0.5, 1, 0.5, 0]) --> [-1,-1, 1, 1, 0]
    box_preds[..., 2:4] = torch.sign(box_preds[..., 2:4]) * \
      torch.sqrt(torch.abs(box_preds[..., 2:4] + 1e-6))

    box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

    #Flatten the first 3 dims, input (N, S, S, 4) --> (N * S * S, 4)
    box_loss = self.mse(
         torch.flatten(box_preds, end_dim = -2),
         torch.flatten(box_targets, end_dim = -2),
    )

    #object loss
    # pred box shape = (N * S * S, 1)
    pred_box = (
        bestbox * preds[..., 25:26] + (1 - bestbox) * preds[..., 20:21]
    )

    object_loss = self.mse(
      torch.flatten(exists_box * pred_box),
      torch.flatten(exists_box * target[..., 20:21]),
    )

    #no object loss
    # (N,S,S,1) --> (N, S * S)
    noobject_loss = self.mse(
      torch.flatten((1 - exists_box) * preds[..., 20:21], start_dim=1),
      torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1)
    )
    noobject_loss += self.mse(
      torch.flatten((1 - exists_box) * preds[..., 25:26], start_dim=1),
      torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1)
    )

    #class loss
    # (N, S, S, 20) --> (N *S * S, 20)
    class_loss = self.mse(
      torch.flatten(exists_box * preds[..., :20], end_dim = -2),
      torch.flatten(exists_box * target[..., :20], end_dim = -2)
    )

    loss = (
      self.lambda_coord * box_loss\
      + object_loss
      + self.lambda_noobj * noobject_loss
      + class_loss
    )

    return loss



# Loading the dataset

In [20]:
class VOCDataset(Dataset):
    def __init__(self, csv_file, img_dir, label_dir, S = 7, B = 2, C = 20, transforms = None):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.S = S
        self.B = B
        self.C = C
    
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, index):
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index,1])
        print(label_path)

VOC_PATH = './data/'
VOC_IMAGE_PATH = './data/images/'
VOC_LABEL_PATH = './data/labels/'

TRAIN_VOC_CSV = os.path.join(VOC_PATH, 'train.csv')
voc = VOCDataset(TRAIN_VOC_CSV, VOC_IMAGE_PATH, VOC_LABEL_PATH)


In [21]:
print(voc[0])

./data/labels/000007.txt
None


In [27]:
# def IoU(box_preds, box_labels, box_format = "midpoint"):

#   if box_format == 'midpoint':
#     box1_x1 = box_preds[..., 0:1] - box_preds[..., 2:3] / 2
#     box1_y1 = box_preds[..., 1:2] - box_preds[..., 3:4] / 2
#     box1_x2 = box_preds[..., 0:1] + box_preds[..., 2:3] / 2
#     box1_y2 = box_preds[..., 1:2] + box_preds[..., 3:4] / 2

#     box2_x1 = box_labels[..., 0:1] - box_labels[..., 2:3] / 2
#     box2_y1 = box_labels[..., 1:2] - box_labels[..., 3:4] / 2
#     box2_x2 = box_labels[..., 0:1] + box_labels[..., 2:3] / 2
#     box2_y2 = box_labels[..., 1:2] + box_labels[..., 3:4] / 2
#   else:
#     box1_x1 = box_preds[..., 0:1]
#     box1_y1 = box_preds[..., 1:2]
#     box1_x2 = box_preds[..., 2:3]
#     box1_y2 = box_preds[..., 3:4]
#     box2_x1 = box_labels[..., 0:1]
#     box2_y1 = box_labels[..., 1:2]
#     box2_x2 = box_labels[..., 2:3]
#     box2_y2 = box_labels[..., 3:4]

#   x1 = torch.max(box1_x1, box2_x1)
#   y1 = torch.max(box1_y1, box2_y1)
#   x2 = torch.min(box1_x2, box2_x2)
#   y2 = torch.min(box2_y2, box2_y2)

#   #if x2 - x1 < 0 or y2 - y1 < 0, set intersection to 0
#   intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

#   box1_area = abs((box1_x2 - box1_x1) * (box1_y1 - box1_y2))
#   box2_area = abs((box2_x2 - box2_x1) * (box2_y1 - box2_y2))

#   return intersection / (box1_area + box2_area)


In [28]:
# def no_max_supression(bbox, iou_threshold, threshold, box_format = 'midpoint'):
#   #pred format (1, 0.9, x1, y1, x2, y2)
#   # where 1 is the 0,1 value,1 means there's an object in this bbox, else 0
#   # 0.9 is the probability of this box
#   assert type(bbox) == list
#   bbox = [box for box in bbox if box[1] > threshold]
#   bbox = sorted(bbox, key = lambda x: x[1], reverse = True)
#   bbox_after_nms = []

#   while bbox:
#     chosen_box = bbox.pop(0)

#     bbox = [
#         box for box in bbox if box[0] != chosen_box[0]\
#         or IoU(torch.tensor(chosen_box[2:]), torch.tensor(box[2:]), box_format = box_format) < iou_threshold
#     ]

#     bbox_after_nms.append(chosen_box)

#   return bbox_after_nms



In [31]:
# # Mean Average Precision as metric
# def mAP(pred_boxes, true_boxes, iou_threshold =0.5, box_format = 'midpoint', num_classes = 20):
#   average_precisions = []
#   #for numerical stability in float
#   epsilon = 1e-6

#   #pred boxes(list): [[train_idx, class_pred, prob_score, x1, y1, x2, y2], ....]
#   #same for true boxes
#   for c in range(num_classes):
#     detections = []
#     ground_truths = []

#     #only choose pred and true for that class
#     for pred_box in pred_boxes:
#       if pred_box[1] == c:
#         detections.append(pred_box)
#     for true_box in true_boxes:
#       if true_box[1] == c:
#         ground_truths.append(true_box)

#     #img0 has 3 bboxes
#     #img1 has 5 bboxes
#     #convert to dictionary: amount_bboxes = {0:3, 1:5}
#     amount_bboxes = Counter([gt[0] for gt in ground_truths])

#     #convert to dictionary with tensors:
#     #amount_bboxes = {0: tensor([0,0,0]), 1:tensor([0,0,0,0,0])}
#     #we're doing this because we gonna mark only 1 box in that image as true
#     #the other images are FP
#     for key, val in amount_bboxes.item():
#       amount_bboxes[key] = torch.zeros(val)

#     detections.sort(key = lambda x: x[2], reverse = True)
#     TP = torch.zeros(len(detections))
#     FP = torch.zeros(len(detections))

#     total_true_boxes = len(ground_truths)

#     for detection_idx, detection in enumerate(detections):
#       #get the ground truth with the same id with the detected box
#       ground_truth_img = [ bbox for bbox in ground_truths if bbox[0] == detection[0]]

#       num_ground_truths = len(ground_truth_img)
#       best_iou = 0
#       best_gt_idx = 0

#       for idx, gt in enumerate(ground_truth_img):
#         iou = IoU(torch.tensor(detection[3:]), torch.tensor(ground_truths[3:]), box_format= box_format)

#         if iou > best_iou:
#           best_iou = iou
#           best_gt_idx = idx

#       if best_iou > iou_threshold:
#         if amount_bboxes[detection[0]][best_gt_idx] == 0:
#           TP[detection_idx] = 1
#           amount_bboxes[detection[0]][best_gt_idx] = 1
#         else:
#           FP[detection_idx] = 1
#       else:
#           FP[detection_idx] = 1

#       # [1,1,0,1,1,0] --> [1,2,2,3,4,4]
#       TP_cumsum = torch.cumsum(TP, dim = 0)
#       FP_cumsum = torch.cumsum(FP, dim = 0)

#       recalls = TP_cumsum / (total_true_boxes + epsilon)
#       precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
#       precisions = torch.cat(torch.tensor([1]), precisions)

#       recalls = torch.cat((torch.tensor([0])), recalls)

#       average_precisions.append(torch.trapz(precisions, recalls))

#     return  sum(average_precisions) / len(average_precisions)


In [None]:
X = torch.randn((2,1470))
print(X.shape)
X = X.reshape(-1, 7,7, 30)
print(X.shape)

torch.Size([2, 1470])
torch.Size([2, 7, 7, 30])


In [None]:
a = torch.tensor([list(range(30)) for _ in range(5)])

print(a[..., 20].unsqueeze(2))


IndexError: ignored