In [2]:
!pip install xmltodict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


## Import Module

In [4]:
from torchvision.datasets import VOCDetection
from PIL import Image, ImageDraw, ImageFont
from torchvision.transforms.functional import to_tensor, to_pil_image
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensor

### Convert xml files into List(Dict)

![](https://miro.medium.com/max/1400/1*wDgrfpkgh6wbLJQvHbefWQ.png)

In [None]:
def get_infos(annot_f=annot_f, mode='train'):
    """Parse every XML annotation file of one split into a list of dicts.

    Args:
        annot_f: Format string with one ``{}`` placeholder; it is filled with
            ``mode`` to obtain the annotation directory. The default is the
            module-level ``annot_f`` as it was when this function was defined.
        mode: Split name substituted into ``annot_f``. Files that yield no
            usable box are dropped unless ``mode == 'test'``.

    Returns:
        list[dict]: One entry per kept annotation file with the keys
            ``'image_id'``   -- the ``filename`` field of the annotation,
            ``'image_size'`` -- first two values of the ``size`` element as an
                                ``np.int16`` array (used below as ``w, h``;
                                assumes width is listed before height),
            ``'bboxs'``      -- ``float64`` array of shape ``(n, 4)`` holding
                                ``[xmin, ymin, xmax, ymax]`` with x divided by
                                ``w`` and y divided by ``h``,
            ``'labels'``     -- indices into the module-level ``classes`` list,
                                aligned one-to-one with ``bboxs``.
    """
    annot_dir = annot_f.format(mode)
    result = []
    for file_name in os.listdir(annot_dir):
        # Read one XML file; `with` closes the handle even if parsing fails.
        with open(pth.join(annot_dir, file_name)) as f:
            info = xmltodict.parse(f.read())['annotation']

        image_id = info['filename']
        image_size = np.asarray(tuple(map(int, info['size'].values()))[:2], np.int16)
        w, h = image_size

        # xmltodict returns a plain dict (not a list) when there is exactly one
        # <object> element; normalise so the loop always sees a list of objects.
        box_objects = info.get('object', [])
        if isinstance(box_objects, dict):
            box_objects = [box_objects]

        labels = []
        bboxs = []
        for obj in box_objects:
            try:
                label = classes.index(obj['name'].lower())
                bndbox = obj['bndbox']
                # Read the corners by key so the result does not depend on the
                # order in which the XML happens to list them.
                box = tuple(int(bndbox[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
            except (KeyError, TypeError, ValueError, AttributeError):
                # Best effort: skip objects with an unknown class name or a
                # malformed box instead of aborting the whole split.
                continue
            # Append both only after both parsed, so labels and boxes stay aligned.
            labels.append(label)
            bboxs.append(box)

        # Normalise the corner coordinates by the image size.
        # reshape keeps an empty result two-dimensional so the slicing below is valid.
        bboxs = np.asarray(bboxs, dtype=np.float64).reshape(-1, 4)
        bboxs[:, [0, 2]] /= w  # x coordinates
        bboxs[:, [1, 3]] /= h  # y coordinates

        if bboxs.shape[0] or mode == 'test':
            result.append({'image_id': image_id, 'image_size': image_size, 'bboxs': bboxs, 'labels': labels})
    return result
    # Notes kept from the original author (target format, not produced here):
    # box = [x_centre, y_centre, width, height]
    # label = [c1, c2, c3, ..., c20, pc1, x, y, w, h]
    
# Parse the annotation files of both splits once, up front.
trval_list = get_infos(mode='train')
test_list = get_infos(mode='test')

# Cell output: number of parsed entries per split.
(len(trval_list), len(test_list))

### Split Train Data

In [None]:
def get_tv_idx(tl, k=0.5, seed=None):
    """Randomly split the indices ``0 .. tl - 1`` into train and validation parts.

    Args:
        tl: Total number of samples.
        k: Fraction of the indices drawn for training; ``int(tl * k)`` indices
            are sampled without replacement.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the split is reproducible across runs. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        tuple[list[int], list[int]]: ``(train_idx, valid_idx)``. ``train_idx``
        is in sampled order; ``valid_idx`` holds every index that was not
        drawn, in ascending order.

    Raises:
        ValueError: If ``k`` is outside ``[0, 1]``.
    """
    import random  # local import: this cell must not rely on a `sample` name defined elsewhere

    if not 0 <= k <= 1:
        raise ValueError("k must be within [0, 1], got {!r}".format(k))

    total_idx = range(tl)
    rng = random if seed is None else random.Random(seed)
    train_idx = rng.sample(total_idx, int(tl * k))

    # Everything that was not drawn goes to validation; iterating the range
    # keeps the result ordered instead of relying on set iteration order.
    drawn = set(train_idx)
    valid_idx = [idx for idx in total_idx if idx not in drawn]
    return train_idx, valid_idx

# Draw the index split over all parsed train/val entries.
n_trval = len(trval_list)
train_idx, valid_idx = get_tv_idx(n_trval)

# Convert the list to an array so it can be indexed with a list of indices.
trval_list = np.asarray(trval_list)
train_list, valid_list = trval_list[train_idx], trval_list[valid_idx]

# Cell output: sizes of the three subsets.
(len(train_list), len(valid_list), len(test_list))

### Dataset

In [None]:
import torch
import os
import pandas as pd
from PIL import Image


class VOCDataset(torch.utils.data.Dataset):
    """Dataset that pairs an image with a YOLO-v1 style grid target.

    ``csv_file`` is read with ``pandas.read_csv``. For each row, column 0 is the
    image file name (joined onto ``img_dir``) and column 1 is the label file
    name (joined onto ``label_dir``). Every non-blank line of a label file holds
    five whitespace-separated numbers: ``class_label x y width height``.
    The coordinates are presumably box-centre / size values normalised to
    ``[0, 1]`` (the grid index is computed as ``int(S * x)``) -- confirm against
    the files that produce the labels.

    Args:
        csv_file: Path of the CSV index described above.
        img_dir: Directory containing the images.
        label_dir: Directory containing the label text files.
        S: Number of grid cells per side.
        B: Number of boxes per cell reserved in the target's last dimension.
        C: Number of classes.
        transform: Optional callable ``(image, boxes) -> (image, boxes)``;
            it receives the boxes so augmentations can move them with the image.
    """

    def __init__(self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        """Return the number of rows in the CSV index."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load sample ``index``.

        Returns:
            tuple: ``(image, label_matrix)`` where ``label_matrix`` is a float
            tensor of shape ``(S, S, C + 5 * B)``. For a cell that owns a box,
            channel ``class_label`` and channel ``C`` are 1 and channels
            ``C+1 .. C+4`` hold ``[x_cell, y_cell, width_cell, height_cell]``.
            Only the first box that falls into a cell is stored.
        """
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = []
        with open(label_path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank / trailing empty lines
                # Parse via float so integral values written as "1.0" do not
                # break the conversion; the class id is made an int explicitly.
                class_label, x, y, width, height = (float(value) for value in fields)
                boxes.append([int(class_label), x, y, width, height])

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)

        # The boxes are handed to the transform as a tensor.
        boxes = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))

        last_cell = self.S - 1
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # Row i / column j of the grid cell containing the box centre.
            # Clamp so a coordinate of exactly 1.0 lands in the last cell
            # instead of indexing past the grid.
            i = max(0, min(int(self.S * y), last_cell))
            j = max(0, min(int(self.S * x), last_cell))

            # Centre relative to the cell origin; size in units of one cell.
            x_cell, y_cell = self.S * x - j, self.S * y - i
            width_cell, height_cell = width * self.S, height * self.S

            # Each cell stores at most one object: the first one that claims it.
            if label_matrix[i, j, self.C] == 0:
                label_matrix[i, j, self.C] = 1  # objectness flag
                label_matrix[i, j, self.C + 1:self.C + 5] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                label_matrix[i, j, class_label] = 1  # one-hot class

        return image, label_matrix

![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2F5f0uM%2FbtqVuhTDtaq%2FQGkjKmvPpOBdwxTLDyHuGK%2Fimg.png)

In [None]:
import torch
import torch.nn as nn

# Layer-by-layer description of the convolutional backbone, consumed by
# Yolov1._create_conv_layers. Three entry kinds are recognised there:
#   tuple -> one CNNBlock, given as (kernel size, output filters, stride, padding)
#   "M"   -> nn.MaxPool2d(kernel_size=2, stride=2)
#   list  -> [tuple, tuple, n]: the two CNNBlocks in sequence, repeated n times
architecture_config = [
    # tuple = (kernel size, number of output filters, stride, padding)
    (7, 64, 2, 3),
    "M",  # max-pooling 2x2, stride 2
    (3, 192, 1, 1),
    "M",  # max-pooling 2x2, stride 2
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",  # max-pooling 2x2, stride 2
    # [tuple, tuple, repeat count]
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",  # max-pooling 2x2, stride 2
    # [tuple, tuple, repeat count]
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]


class CNNBlock(nn.Module):
    """Bias-free 2-D convolution followed by batch norm and LeakyReLU(0.1).

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels (also the batch-norm width).
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (e.g. ``kernel_size``,
            ``stride``, ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # No conv bias: the batch norm that follows has its own shift term.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply conv -> batch norm -> leaky ReLU to ``x``."""
        return self.leakyrelu(self.batchnorm(self.conv(x)))


class Yolov1(nn.Module):
    """Convolutional backbone built from ``architecture_config`` plus a dense head.

    Args:
        in_channels: Channel count of the input images.
        **kwargs: Forwarded to ``_create_fcs``; must provide ``split_size``,
            ``num_boxes`` and ``num_classes``.
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """Run the backbone, flatten everything but the batch axis, apply the head."""
        features = torch.flatten(self.darknet(x), start_dim=1)
        return self.fcs(features)

    def _create_conv_layers(self, architecture):
        """Translate the config list into an ``nn.Sequential`` of conv / pool layers."""

        def conv_block(channels_in, spec):
            # spec = (kernel size, output filters, stride, padding)
            return CNNBlock(
                channels_in,
                spec[1],
                kernel_size=spec[0],
                stride=spec[2],
                padding=spec[3],
            )

        modules = []
        channels = self.in_channels

        for entry in architecture:
            kind = type(entry)
            if kind is tuple:
                # Single convolution block.
                modules.append(conv_block(channels, entry))
                channels = entry[1]
            elif kind is str:
                # Any string entry stands for a 2x2 max-pool with stride 2.
                modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            elif kind is list:
                # [first conv, second conv, repeat count]
                first, second, repeats = entry[0], entry[1], entry[2]
                for _ in range(repeats):
                    modules.append(conv_block(channels, first))
                    modules.append(conv_block(first[1], second))
                    channels = second[1]

        return nn.Sequential(*modules)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """Build the dense head mapping backbone features to S*S*(C + 5*B) outputs."""
        cells = split_size * split_size
        out_features = cells * (num_classes + num_boxes * 5)
        head = [
            nn.Flatten(),
            nn.Linear(1024 * cells, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, out_features),
        ]
        return nn.Sequential(*head)


def test(S=7, B=2, C=20):
    """Smoke test: push a random batch of two 3x448x448 images through Yolov1 and print the output shape."""
    net = Yolov1(split_size=S, num_boxes=B, num_classes=C)
    dummy_batch = torch.randn((2, 3, 448, 448))
    output = net(dummy_batch)
    print(output.shape)


# Run the smoke test only when this code is executed as the main module
# (which includes running the cell in a notebook), not when it is imported.
if __name__ == "__main__":
    test()

![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FwNXOK%2FbtqSpGVHmHc%2FKbsxRBSs6KymYB3PkEny21%2Fimg.png)

- area of overlap : predicted bounding box와 ground-truth bounding box가 겹치는 영역
- area of union : predicted bounding box와 ground-truth bounding box를 둘러싸는 영역

### 왜 IOU를 사용?
- 실제로 predicted bounding box가 정확히 ground-truth bounding box와 일치하는 경우는 존재하지 않는다. 이 때문에 predicted bounding box가 ground-truth bounding box와 얼마나 일치하는지 측정하는 지표로 IoU를 사용한다.



In [None]:
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """Compute the element-wise IoU between two sets of boxes.

    Args:
        boxes_preds: Tensor whose last dimension holds the 4 box values.
        boxes_labels: Tensor of ground-truth boxes in the same layout; the
            two inputs are combined element-wise (broadcasting applies).
        box_format: ``"midpoint"`` for ``(x_centre, y_centre, width, height)``
            or ``"corners"`` for ``(x1, y1, x2, y2)``.

    Returns:
        Tensor with the leading dimensions of the inputs and a last dimension
        of size 1 containing intersection / union.

    Raises:
        ValueError: If ``box_format`` is neither ``"midpoint"`` nor ``"corners"``.
    """
    if box_format == "midpoint":
        # Convert centre/size to corners: centre -/+ half the extent.
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        # Slices (0:1 rather than 0) keep the trailing dimension of size 1.
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        # Previously an unknown format fell through both branches and failed
        # later with an unbound-variable error.
        raise ValueError(
            "box_format must be 'midpoint' or 'corners', got {!r}".format(box_format)
        )

    # Corners of the intersection rectangle.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # clamp(0) zeroes the extent when the boxes do not overlap on an axis.
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small epsilon avoids division by zero for two empty boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)

![](https://velog.velcdn.com/images%2Fkimkj38%2Fpost%2F95459937-5bd9-4088-84db-59f13233e70a%2Fimage.png)

1. localization error의 영향력을 높이기 위해 5의 가중치를 준다.
2. i번째 셀의 j번째 bounding box가 responsible box일 때만 x, y좌표에 대한 error를 계산한다.
3. 배경 class의 경우 학습에 영향을 덜 미치도록 가중치를 0.5로 설정해준다.
4. i번째 셀, j번째 bounding box가 배경일 경우에만 confidence error를 계산한다.
5. 객체를 포함한 bounding box에 대한 confidence error.
6. bounding box와 관계없이 각 셀마다 클래스를 분류하기 위한 오차.

In [None]:
import torch
import torch.nn as nn
from utils import intersection_over_union


class YoloLoss(nn.Module):
    """Sum-of-squares YOLO-v1 style loss over an S x S grid.

    Channel layout assumed for the last dimension (of size ``C + 5 * B``):
        ``[0, C)``         class scores
        ``C``              confidence of box 1 (objectness flag in the target)
        ``[C+1, C+5)``     box 1 as (x, y, width, height)
        ``C+5``            confidence of box 2
        ``[C+6, C+10)``    box 2 as (x, y, width, height)
    The target only carries one box per cell, stored in the box-1 slots.
    Exactly two predicted boxes are compared, regardless of ``B``.

    Args:
        S: Grid size per side.
        B: Boxes per cell; used for reshaping the predictions.
        C: Number of classes.
    """

    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        # Down-weights the confidence error of cells without an object.
        self.lambda_noobj = 0.5
        # Up-weights the localisation error.
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """Return the scalar loss.

        Args:
            predictions: Tensor reshapeable to ``(N, S, S, C + 5 * B)``.
            target: Tensor of shape ``(N, S, S, C + 5 * B)`` laid out as
                described on the class.
        """
        C = self.C
        conf1, box1 = slice(C, C + 1), slice(C + 1, C + 5)
        conf2, box2 = slice(C + 5, C + 6), slice(C + 6, C + 10)

        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)

        # IoU of each predicted box with the ground-truth box. Both are
        # compared against the target's box-1 slots: that is the only place
        # the ground truth is stored, so comparing box 2 against the target's
        # (all-zero) box-2 slots would make box 2 never win.
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # bestbox is 0 where box 1 has the higher IoU and 1 where box 2 does.
        iou_maxes, bestbox = torch.max(ious, dim=0)

        # 1 for cells that contain an object, 0 otherwise; shape (N, S, S, 1).
        exists_box = target[..., C].unsqueeze(3)

        # ---- Localisation loss ---- #

        # bestbox / (1 - bestbox) act as a switch selecting the responsible
        # box; exists_box zeroes cells without an object.
        box_predictions = exists_box * (
            bestbox * predictions[..., box2] + (1 - bestbox) * predictions[..., box1]
        )
        box_targets = exists_box * target[..., box1]

        # Square root of width/height. Predictions may be negative, so take the
        # root of the magnitude and restore the sign; the epsilon keeps the
        # gradient of sqrt finite at zero.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2)
        )

        # ---- Confidence loss ---- #

        # Confidence of the responsible box (conf2 belongs to box 2, conf1 to box 1).
        pred_box = (
            bestbox * predictions[..., conf2] + (1 - bestbox) * predictions[..., conf1]
        )

        # Cells with an object: the target confidence is scaled by the best IoU.
        # NOTE(review): iou_maxes is computed from `predictions` and is not
        # detached, so gradients also flow through this target term -- confirm
        # that this is intended.
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1] * iou_maxes)
        )

        # Cells without an object: both predicted confidences are penalised.
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1)
        )
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1)
        )

        # ---- Class loss ---- #

        # (N, S, S, C) -> (N*S*S, C), only for cells that contain an object.
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2)
        )

        # ---- Final loss ---- #

        loss = (
            self.lambda_coord * box_loss
            + object_loss
            + self.lambda_noobj * no_object_loss
            + class_loss
        )

        return loss