In [1]:
import os
import cv2
import torch
import torchvision
import numpy as np
import pandas as pd
import torch.nn as nn
import albumentations as A
import torch.optim as optim
import matplotlib.pyplot as plt

from glob import glob
from tqdm import tqdm
from termcolor import colored
from matplotlib import patches
from collections import Counter
from torchvision import transforms
from torch.utils.data import DataLoader
from albumentations.pytorch import ToTensorV2

In [None]:
def convert_to_yolo_format(target, img_width, img_height, class_mapping):
    """Convert a parsed Pascal VOC annotation into normalised YOLO boxes.

    Args:
        target: Annotation dict. ``target["annotation"]["object"]`` is either
            one object dict or a list of them, each holding ``name`` and a
            ``bndbox`` with ``xmin``/``ymin``/``xmax``/``ymax``.
            ``target["annotation"]["size"]`` holds ``width`` and ``height``.
        img_width: Unused; kept so existing callers keep working. The size
            recorded in the annotation is used for normalisation instead.
        img_height: Unused; see ``img_width``.
        class_mapping: Dict mapping class name to integer id. Names missing
            from the mapping fall back to id 0.

    Returns:
        ``np.ndarray`` of shape ``(num_objects, 5)`` whose rows are
        ``[class_id, x_center, y_center, width, height]`` with coordinates
        expressed as fractions of the annotated image size.
    """
    annotations = target["annotation"]["object"]

    real_width = int(target["annotation"]["size"]["width"])
    real_height = int(target["annotation"]["size"]["height"])

    # A single object is stored as a bare dict rather than a one-item list.
    if not isinstance(annotations, list):
        annotations = [annotations]

    boxes = []

    for anno in annotations:
        bndbox = anno["bndbox"]

        # x coordinates are normalised by the width, y coordinates by the height.
        xmin = int(bndbox["xmin"]) / real_width
        xmax = int(bndbox["xmax"]) / real_width
        ymin = int(bndbox["ymin"]) / real_height
        ymax = int(bndbox["ymax"]) / real_height

        x_center = (xmin + xmax) / 2
        y_center = (ymin + ymax) / 2

        width = xmax - xmin
        height = ymax - ymin

        class_id = class_mapping.get(anno["name"], 0)

        boxes.append([class_id, x_center, y_center, width, height])

    # reshape keeps the result 2-D even when there are no objects, so callers
    # can always slice columns.
    return np.array(boxes, dtype=np.float64).reshape(-1, 5)

In [None]:
class CustomVOCDataset(torchvision.datasets.VOCDetection):
    """VOCDetection variant that returns YOLOv1-style grid targets.

    ``init_config_yolo`` must be called after construction and before any
    item is fetched, because ``__getitem__`` reads the attributes it sets.
    """

    def init_config_yolo(self, class_mapping, S=7, B=2, C=20, custom_transforms=None):
        """Store the YOLO grid configuration used by ``__getitem__``.

        Args:
            class_mapping: Dict mapping class name to integer id.
            S: Number of grid cells per image side.
            B: Number of boxes predicted per cell.
            C: Number of classes.
            custom_transforms: Optional callable invoked with ``image``,
                ``bboxes`` and ``labels`` keyword arguments and returning a
                mapping with the same keys (presumably an albumentations
                ``Compose`` configured for YOLO boxes - confirm at call site).
        """
        self.S = S
        self.B = B
        self.C = C
        self.class_mapping = class_mapping
        self.custom_transforms = custom_transforms

    def __getitem__(self, index):
        """Return ``(image, label_matrix)`` for the sample at ``index``.

        ``label_matrix`` has shape ``(S, S, C + 5 * B)``. For each cell that
        contains an object centre, channel ``C`` is set to 1, channels
        ``C + 1 : C + 5`` hold ``[x_cell, y_cell, width_cell, height_cell]``
        and the channel at the class id is set to 1. Only the first object
        falling in a cell is recorded.
        """
        image, target = super(CustomVOCDataset, self).__getitem__(index)
        img_width, img_height = image.size

        boxes = convert_to_yolo_format(target, img_width, img_height, self.class_mapping)
        just_boxes = boxes[:, 1:]
        labels = boxes[:, 0]
        # When no transform is configured, the untransformed boxes are used below.
        boxes = just_boxes

        if self.custom_transforms:
            sample = {
                'image': np.array(image),
                'bboxes': just_boxes,
                'labels': labels
            }

            sample = self.custom_transforms(**sample)
            image = sample['image']
            boxes = sample['bboxes']
            labels = sample['labels']

        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)
        image = torch.as_tensor(image, dtype=torch.float32)

        # Channel layout is derived from C so non-default class counts work.
        obj_idx = self.C
        box_slice = slice(self.C + 1, self.C + 5)

        for box, class_label in zip(boxes, labels):
            x, y, width, height = box.tolist()
            class_label = int(class_label)

            # Row i / column j of the cell holding the box centre. A centre at
            # exactly 1.0 would index one past the grid, so clamp to the last cell.
            i = min(int(self.S * y), self.S - 1)
            j = min(int(self.S * x), self.S - 1)

            # Centre offset within the cell, and box size in cell units.
            x_cell, y_cell = self.S * x - j, self.S * y - i
            width_cell, height_cell = width * self.S, height * self.S

            if label_matrix[i, j, obj_idx] == 0:
                label_matrix[i, j, obj_idx] = 1
                label_matrix[i, j, box_slice] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                label_matrix[i, j, class_label] = 1

        return image, label_matrix

In [None]:
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """Compute the intersection-over-union of two sets of boxes.

    Args:
        boxes_preds: Tensor whose last dimension holds four box values.
        boxes_labels: Tensor of the same layout as ``boxes_preds``.
        box_format: ``"midpoint"`` for ``(x_center, y_center, width, height)``
            or ``"corners"`` for ``(x1, y1, x2, y2)``.

    Returns:
        Tensor of IoU values with the leading dimensions of the inputs and a
        trailing dimension of size 1.

    Raises:
        ValueError: If ``box_format`` is neither ``"midpoint"`` nor ``"corners"``.
    """
    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2

        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]

        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        raise ValueError(
            f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
        )

    # The overlap rectangle starts at the larger of the two top-left corners
    # and ends at the smaller of the two bottom-right corners.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # clamp(0) turns a negative extent (disjoint boxes) into zero area.
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = torch.abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = torch.abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small constant avoids division by zero for degenerate boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)

In [None]:
def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """Greedy per-class non-maximum suppression.

    Args:
        bboxes: List of boxes, each ``[class, score, c0, c1, c2, c3]``.
        iou_threshold: A same-class box is dropped when its IoU with an
            already kept box is not below this value.
        threshold: Boxes whose score is not above this value are discarded
            before suppression.
        box_format: Coordinate layout forwarded to ``intersection_over_union``.

    Returns:
        The kept boxes, ordered by descending score.
    """
    assert type(bboxes) == list

    # Keep only confident boxes, highest score first.
    candidates = sorted(
        (candidate for candidate in bboxes if candidate[1] > threshold),
        key=lambda candidate: candidate[1],
        reverse=True,
    )

    kept = []

    while candidates:
        best, remaining = candidates[0], candidates[1:]

        survivors = []
        for other in remaining:
            # Boxes of another class are never suppressed by this one.
            if other[0] != best[0]:
                survivors.append(other)
                continue

            overlap = intersection_over_union(
                torch.tensor(best[2:]),
                torch.tensor(other[2:]),
                box_format=box_format,
            )
            if overlap < iou_threshold:
                survivors.append(other)

        candidates = survivors
        kept.append(best)

    return kept

In [None]:
def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20):
    """Compute mean average precision over all classes at one IoU threshold.

    Args:
        pred_boxes: List of detections, each
            ``[image_idx, class, score, b0, b1, b2, b3]``.
        true_boxes: List of ground-truth boxes in the same layout.
        iou_threshold: A detection counts as a true positive when its best
            IoU with an unmatched ground-truth box exceeds this value.
        box_format: Coordinate layout forwarded to ``intersection_over_union``.
        num_classes: Number of class ids to evaluate (``0 .. num_classes - 1``).

    Returns:
        Scalar tensor with the mean of the per-class average precisions.
        Classes without any ground-truth box are skipped; if every class is
        skipped the result is ``torch.tensor(0.0)``.
    """
    average_precisions = []

    # Guards the precision / recall divisions against a zero denominator.
    epsilon = 1e-6

    for c in range(num_classes):
        detections = [detection for detection in pred_boxes if detection[1] == c]
        ground_truths = [true_box for true_box in true_boxes if true_box[1] == c]

        total_true_bboxes = len(ground_truths)
        if total_true_bboxes == 0:
            continue

        # One zero flag per ground-truth box, grouped by image index; a flag
        # flips to 1 once that box has been claimed by a detection.
        amount_boxes = {
            image_idx: torch.zeros(count)
            for image_idx, count in Counter(gt[0] for gt in ground_truths).items()
        }

        # Most confident detections get first pick of the ground truths.
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))

        for detection_idx, detection in enumerate(detections):
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]

            best_iou = 0
            best_gt_idx = -1  # stays -1 when no ground truth overlaps

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format
                ).item()

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if (
                best_gt_idx >= 0
                and best_iou > iou_threshold
                and amount_boxes[detection[0]][best_gt_idx] == 0
            ):
                TP[detection_idx] = 1
                amount_boxes[detection[0]][best_gt_idx] = 1
            else:
                # Low overlap, or the ground truth was already matched.
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))

        # Anchor the curve at (recall=0, precision=1) so the integral starts
        # from the y-axis.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))

        average_precisions.append(torch.trapz(precisions, recalls))

    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)

In [None]:
# Layer specification consumed by YOLOv1._create_conv_layers:
#   tuple -> one conv block as (kernel_size, out_channels, stride, padding)
#   "M"   -> a 2x2 max-pooling layer with stride 2
#   list  -> [conv_a, conv_b, repeats]: the pair of conv blocks repeated `repeats` times
architecture_config = [
    (7, 64, 2, 3),   # Convolutional block 1
    "M",             # Max - pooling layer 1
    (3, 192, 1, 1),  # Convolutional block 2
    "M",             # Max - pooling layer 2
    (1, 128, 1, 0),  # Convolutional block 3
    (3, 256, 1, 1),  # Convolutional block 4
    (1, 256, 1, 0),  # Convolutional block 5
    (3, 512, 1, 1),  # Convolutional block 6
    "M",             # Max - pooling layer 3
    [(1, 256, 1, 0), (3, 512, 1, 1), 4], # Convolutional block 7 (repeated 4 times)
    (1, 512, 1, 0),  # Convolutional block 8
    (3, 1024, 1, 1), # Convolutional block 9
    "M",             # Max - pooling layer 4
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2], # Convolutional block 10 (repeated 2 times)
    (3, 1024, 1, 1), # Convolutional block 11
    (3, 1024, 2, 1), # Convolutional block 12
    (3, 1024, 1, 1), # Convolutional block 13
    (3, 1024, 1, 1), # Convolutional block 14
]

class CNNBlock(nn.Module):
    """Convolution followed by batch normalisation and LeakyReLU(0.1)."""

    def __init__(self, in_channels, out_channels, bias=False, **kwargs):
        """Build the block.

        Args:
            in_channels: Number of input channels of the convolution.
            out_channels: Number of output channels of the convolution.
            bias: Whether the convolution has a bias term. Defaults to
                ``False`` because the following batch norm has its own shift.
            **kwargs: Forwarded to ``nn.Conv2d`` (e.g. ``kernel_size``,
                ``stride``, ``padding``).
        """
        super(CNNBlock, self).__init__()
        # Honour the caller's bias choice rather than always disabling it.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=bias, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply conv -> batch norm -> leaky ReLU to ``x``."""
        return self.leakyrelu(self.batchnorm(self.conv(x)))

In [None]:
class YOLOv1(nn.Module):
    """YOLOv1 network: a convolutional backbone plus a fully connected head."""

    def __init__(self, in_channels=3, **kwargs):
        """Build the model.

        Args:
            in_channels: Number of channels of the input image.
            **kwargs: Forwarded to ``_create_fcs``; must provide
                ``split_size``, ``num_boxes`` and ``num_classes``.
        """
        super(YOLOv1, self).__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """Run the backbone and the head.

        Returns a tensor of shape ``(batch, S * S * (C + B * 5))``. The first
        linear layer expects ``1024 * S * S`` features, so the backbone output
        must be ``S x S`` spatially (presumably a 448x448 input for S=7 -
        confirm against the data pipeline).
        """
        return self.fcs(self.darknet(x))

    def _create_conv_layers(self, architecture):
        """Translate the layer specification into an ``nn.Sequential``.

        A tuple is one conv block ``(kernel_size, out_channels, stride,
        padding)``, a string is a 2x2 max-pool with stride 2, and a list is
        ``[conv_a, conv_b, repeats]``.
        """
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if isinstance(x, tuple):
                layers.append(
                    CNNBlock(in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3])
                )
                in_channels = x[1]

            elif isinstance(x, str):
                layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))

            elif isinstance(x, list):
                conv1, conv2, num_repeats = x[0], x[1], x[2]

                for _ in range(num_repeats):
                    layers.append(
                        CNNBlock(
                            in_channels, conv1[1],
                            kernel_size=conv1[0],
                            stride=conv1[2],
                            padding=conv1[3]
                        )
                    )
                    layers.append(
                        CNNBlock(
                            conv1[1], conv2[1],
                            kernel_size=conv2[0],
                            stride=conv2[2],
                            padding=conv2[3]
                        )
                    )
                    # The next repeat (or layer) consumes conv2's output.
                    in_channels = conv2[1]

        return nn.Sequential(*layers)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """Build the fully connected head.

        Args:
            split_size: Grid size S.
            num_boxes: Boxes per cell B.
            num_classes: Number of classes C.
        """
        S, B, C = split_size, num_boxes, num_classes

        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 4096),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, S * S * (C + B * 5))
        )

In [None]:
class YOLOLoss(nn.Module):
    def __init__(self, S=7, B=2, C=20):
        super(YOLOLoss).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S # Grid size of the image (7)
        self.B = B # Number of bounding boxes (2)
        self.C = C # Number of classes (in VOC Dataset, it's 20)
        
        self.lambda_noobj = 0.5
        self.lambda_coord = 5
        
    def forward(self, predictions, target):
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B*5)
        
        iou_b1 = intersection_over_union(predictions[..., 21:25], target[..., 21:25])
        iou_b2 = intersection_over_union(predictions[..., 26:30], target[..., 21:25])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
        
        iou_maxes, bestbox = torch.max(ious, dim=0)
        exists_box = target[..., 20].unsqueeze(3)
        
        # LOSS FOR BOX COORDINATES
        box_predictions = exists_box * (
            (
                bestbox * predictions[..., 26:30] + (1 - bestbox)*predictions[..., 21:25]
            )
        )
        box_targets = exists_box * target[..., 21:25]
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[...,2:4] + 1e-6)
        )
        
        box_targets = exists_box * target[..., 21:25]
        
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])
        
        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2)
        )
        
        # LOSS FOR OBJECT 
        pred_box = (
            bestbox * predictions[..., 25:26] + (1 - bestbox) * predictions[..., 20:21]
        )
        
        object_loss = self.mse(
            torch.flatten(exists_box*pred_box),
            torch.flatten(exists_box*target[..., 20:21])
        )
        
        # LOSS FOR NO OBJECT
        