[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/CV/blob/main/object_detection/yolo_version1/runner.ipynb)

In [15]:
# !pip install tqdm==4.66.4
# !pip install torchvision==0.18.1
# !pip install torch==2.3.1
# !pip install albumentations==1.4.13
# !pip install Pillow==10.4.0
# !pip install opencv_python==4.10.0.84
# !pip install einops==0.8.0
#!pip install opencv-python

In [16]:
import argparse
import os
import random
from tqdm import tqdm
import numpy as np
import yaml
import csv
import os
import albumentations as albu
import cv2
import xml.etree.ElementTree as ET

import torch
from torch.utils.data.dataloader import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data.dataset import Dataset
import torch.nn as nn
import torchvision

In [17]:
# Seed every random number generator the notebook relies on (PyTorch, NumPy
# and Python's `random`) so that repeated runs start from the same state.
seed = 1111
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A `torch.device` never compares equal to a plain string, so the device kind
# has to be read from `.type`; comparing the object itself with 'cuda' would
# leave this branch unreachable.
if device.type == 'cuda':
    torch.cuda.manual_seed_all(seed)

In [5]:
# Clone the repository so that the `CV` package imported by later cells
# (loss, inference and evaluation helpers) is available locally.
!git clone https://github.com/khetansarvesh/CV.git

Cloning into 'CV'...
remote: Enumerating objects: 841, done.[K
remote: Counting objects: 100% (306/306), done.[K
remote: Compressing objects: 100% (141/141), done.[K
remote: Total 841 (delta 158), reused 279 (delta 140), pack-reused 535 (from 1)[K
Receiving objects: 100% (841/841), 28.54 MiB | 25.93 MiB/s, done.
Resolving deltas: 100% (455/455), done.


# **Dataset**

In [6]:
# GETTING VOC2007 TRAIN DATASET and EXTRACTING TAR FILES
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
!tar xf VOCtrainval_06-Nov-2007.tar

# GETTING VOC2012 TRAIN DATASET and EXTRACTING TAR FILES
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
!tar xf VOCtrainval_11-May-2012.tar

# GETTING VOC2007 TEST DATASET and EXTRACTING TAR FILES
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar #
!tar xf VOCtest_06-Nov-2007.tar

## Getting the image locations for the 2007 (both train and test) and 2012 (train only) datasets in txt files
# !wget https://pjreddie.com/media/files/voc_label.py
# !python voc_label.py

--2024-11-05 19:51:12--  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460032000 (439M) [application/x-tar]
Saving to: ‘VOCtrainval_06-Nov-2007.tar’


2024-11-05 19:51:24 (36.6 MB/s) - ‘VOCtrainval_06-Nov-2007.tar’ saved [460032000/460032000]

--2024-11-05 19:51:26--  http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1999639040 (1.9G) [application/x-tar]
Saving to: ‘VOCtrainval_11-May-2012.tar’


2024-11-05 19:52:25 (32.6 MB/s) - ‘VOCtrainval_11-May-2012.tar’ saved [1999639040/1999639040]

--2024-11-05 19:52:

In [None]:
def load_images_and_anns(label2idx, ann_fname, split):
    r"""
    Read the VOC xml annotations and build one record per usable image.

    For each image set ('VOCdevkit/VOC2007' and 'VOCdevkit/VOC2012', relative
    to the current working directory) the image ids are read from
    ``ImageSets/Main/{ann_fname}.txt`` and the matching file in
    ``Annotations`` is parsed.

    :param label2idx: mapping from class name (the xml ``name`` tag) to class index
    :param ann_fname: name (without extension) of the image-set txt file, e.g. 'trainval'
    :param split: 'test' keeps objects flagged as difficult; any other value drops them
    :return: list of dicts with keys 'img_id', 'filename', 'width', 'height' and
        'detections' (a list of dicts with 'label', 'bbox' as
        [xmin, ymin, xmax, ymax] shifted by -1, and 'difficult').
        Images without any kept object are left out.
    """
    im_infos = []
    im_sets = ['VOCdevkit/VOC2007', 'VOCdevkit/VOC2012'] #since we will be using VOC dataset

    for im_set in im_sets:
        split_file = os.path.join(im_set, 'ImageSets', 'Main', f'{ann_fname}.txt')

        # Not every image set ships every split file; skip the set instead of
        # aborting the whole load with a FileNotFoundError.
        if not os.path.isfile(split_file):
            print(f'Skipping {im_set} : {split_file} not found')
            continue

        # Fetch all image names in txt file for this imageset. The context
        # manager closes the file handle, and blank lines are ignored so they
        # do not turn into an empty image id.
        with open(split_file) as names_file:
            im_names = [line.strip() for line in names_file if line.strip()]

        # Set annotation (or label) and image path
        ann_dir = os.path.join(im_set, 'Annotations')
        im_dir = os.path.join(im_set, 'JPEGImages')

        # iterate over all the images
        for im_name in im_names:

            # getting some meta data for that image
            ann_file = os.path.join(ann_dir, f'{im_name}.xml')
            ann_info = ET.parse(ann_file)
            size = ann_info.getroot().find('size')
            img_id = os.path.basename(ann_file).split('.xml')[0]
            im_info = {
                'img_id': img_id,
                'filename': os.path.join(im_dir, '{}.jpg'.format(img_id)),
                'width': int(size.find('width').text),
                'height': int(size.find('height').text),
            }

            # getting all the rois for this image
            detections = []
            for obj in ann_info.findall('object'):
                # A missing <difficult> tag is treated as "not difficult".
                difficult_node = obj.find('difficult')
                difficult = int(difficult_node.text) if difficult_node is not None else 0

                # Ignore difficult rois during training
                # At test time eval does the job of ignoring difficult
                # examples.
                if difficult != 0 and split != 'test':
                    continue

                bbox_info = obj.find('bndbox')
                # Coordinates go through float() first because int() alone
                # rejects strings such as '12.0'; the -1 shifts every
                # coordinate down by one.
                bbox = [
                    int(float(bbox_info.find(corner).text)) - 1
                    for corner in ('xmin', 'ymin', 'xmax', 'ymax')
                ]
                detections.append({
                    'label': label2idx[obj.find('name').text],
                    'bbox': bbox,
                    'difficult': difficult,
                })

            # We will keep an image only if there are valid rois in it
            if detections:
                im_info['detections'] = detections
                im_infos.append(im_info)

    print(f'Total {len(im_infos)} images found')
    # Guard the example print so an empty result does not raise IndexError.
    if im_infos:
        print(f'Example Information : {im_infos[0]}')
    return im_infos

In [None]:
class VOCDataset(Dataset):
    """Pascal VOC detection dataset that also builds YOLOv1 grid targets.

    Each item is a tuple ``(im_tensor, targets, filename)`` where ``im_tensor``
    is the resized, normalized RGB image in channel-first layout and
    ``targets`` is a dict with:

    * ``'bboxes'``: boxes as (x1, y1, x2, y2) divided by image width/height,
      shape (num_boxes, 4) (also for zero boxes)
    * ``'labels'``: class index of every box
    * ``'yolo_targets'``: tensor of shape (S, S, 5 * B + C)
    * ``'difficult'``: VOC difficult flag of every box
    """

    def __init__(self, split):
        """
        :param split: 'train' loads the 'trainval' image set and applies the
            augmentation pipeline; any other value loads the 'test' image set.
            Note that the transform lookup in ``__getitem__`` only knows the
            keys 'train' and 'test'.
        """

        # to decide if we are working with training data or testing data
        self.split = split
        self.fname = 'trainval' if self.split == 'train' else 'test'

        self.im_size = 448 # img size is 448*448
        self.S = 7 # 7*7 grid size
        self.B = 2 # no of bounding box prediction per grid

        self.C = 20 # no of classes
        # defining all the 20 classes present in the VOC dataset (sorted alphabetically)
        classes = sorted([
            'person', 'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
            'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train',
            'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor'])

        # creating a dictionary to map class name to index eg 'aeroplane' : 0
        self.label2idx = {name: idx for idx, name in enumerate(classes)}

        # creating a vice versa dictionary i.e. 0 : 'aeroplane'
        self.idx2label = {idx: name for idx, name in enumerate(classes)}

        # The pipelines do not depend on the sample, so build them once here
        # instead of on every __getitem__ call.
        self.transforms = self._build_transforms()

        # getting the VOC Dataset images
        self.images_info = load_images_and_anns(self.label2idx, self.fname, self.split)

    def _build_transforms(self):
        """Create the albumentations pipelines, keyed by split name."""

        def bbox_params():
            # 'difficult' is registered as a label field so that it is
            # filtered together with the boxes whenever an augmentation
            # drops a box; otherwise it would go out of sync with
            # 'bboxes' / 'labels'.
            return albu.BboxParams(format='pascal_voc',
                                   label_fields=['labels', 'difficult'])

        return {
            # performing transformations on train dataset
            'train': albu.Compose([
                albu.HorizontalFlip(p=0.5),
                # p=1.0 replaces the deprecated always_apply=True
                albu.Affine(
                    scale=(0.8, 1.2),
                    translate_percent=(-0.2, 0.2),
                    p=1.0
                ),
                albu.ColorJitter(
                    brightness=(0.8, 1.2),
                    contrast=(0.8, 1.2),
                    saturation=(0.8, 1.2),
                    hue=(-0.2, 0.2),
                    p=0.5,
                ),
                albu.Resize(self.im_size, self.im_size)],
                bbox_params=bbox_params()),

            # performing transformations on test dataset
            'test': albu.Compose([
                albu.Resize(self.im_size, self.im_size),
                ],
                bbox_params=bbox_params())
        }

    def __len__(self):
        """Number of images that have at least one kept annotation."""
        return len(self.images_info)

    def _build_yolo_targets(self, bboxes_tensor, labels_tensor, h, w):
        """Encode absolute (x1, y1, x2, y2) boxes into the (S, S, 5*B+C) YOLO target grid."""
        target_dim = 5 * self.B + self.C
        yolo_targets = torch.zeros(self.S, self.S, target_dim)
        num_boxes = bboxes_tensor.size(0)
        if num_boxes == 0:
            return yolo_targets

        # Height and width of grid cells is H // S and W // S
        cell_h = h // self.S
        cell_w = w // self.S

        # VOC dataset has (x1, y1, x2, y2) format of bounding box so we need to
        # convert it into (Xcenter, Ycenter, width, height) format
        box_widths = bboxes_tensor[:, 2] - bboxes_tensor[:, 0]
        box_heights = bboxes_tensor[:, 3] - bboxes_tensor[:, 1]
        box_center_x = bboxes_tensor[:, 0] + 0.5 * box_widths
        box_center_y = bboxes_tensor[:, 1] + 0.5 * box_heights

        # Get cell i,j from xc, yc. The clamp keeps a centre that sits exactly
        # on the right/bottom image edge inside the last cell instead of
        # indexing past the grid.
        box_i = torch.floor(box_center_x / cell_w).long().clamp(0, self.S - 1)
        box_j = torch.floor(box_center_y / cell_h).long().clamp(0, self.S - 1)

        # Xcenter / Ycenter offset from cell topleft, in units of one cell
        box_xc_cell_offset = (box_center_x - box_i * cell_w) / cell_w
        box_yc_cell_offset = (box_center_y - box_j * cell_h) / cell_h

        # w, h targets normalized to 0-1
        box_w_label = box_widths / w
        box_h_label = box_heights / h

        # Update the target array for all bboxes. A later box that falls into
        # an already used cell overwrites the earlier one.
        for idx in range(num_boxes):
            row, col = box_j[idx], box_i[idx]
            # Make target of the exact same shape as prediction: the same box
            # is written into each of the B box slots.
            for k in range(self.B):
                s = 5 * k
                # target_ij = [xc_offset,yc_offset,sqrt(w),sqrt(h), conf, cls_label]
                yolo_targets[row, col, s] = box_xc_cell_offset[idx]
                yolo_targets[row, col, s+1] = box_yc_cell_offset[idx]
                yolo_targets[row, col, s+2] = box_w_label[idx].sqrt()
                yolo_targets[row, col, s+3] = box_h_label[idx].sqrt()
                yolo_targets[row, col, s+4] = 1.0
            cls_target = torch.zeros((self.C,))
            cls_target[int(labels_tensor[idx])] = 1.
            yolo_targets[row, col, 5 * self.B:] = cls_target
        return yolo_targets

    def __getitem__(self, index):
        """Load one sample and convert its boxes from absolute coordinates
        to grid relative coordinates.

        :param index: position of the sample in ``self.images_info``
        :return: ``(im_tensor, targets, filename)``, see the class docstring
        :raises FileNotFoundError: if the image file cannot be read
        """

        # getting the image
        im_info = self.images_info[index]
        im = cv2.imread(im_info['filename'])
        if im is None:
            # cv2.imread signals a missing/unreadable file by returning None
            raise FileNotFoundError('Could not read image {}'.format(im_info['filename']))
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

        # Get label and bounding box information for this image
        detections = im_info['detections']
        bboxes = [detection['bbox'] for detection in detections]
        labels = [detection['label'] for detection in detections]
        difficult = [detection['difficult'] for detection in detections]

        # Transform Image and label => hence you will also have to transform bounding boxes
        transformed_info = self.transforms[self.split](
            image=im, bboxes=bboxes, labels=labels, difficult=difficult)
        im = transformed_info['image']
        # reshape keeps a (0, 4) shape when no box is left
        bboxes_tensor = torch.as_tensor(transformed_info['bboxes']).reshape(-1, 4)
        labels_tensor = torch.as_tensor(transformed_info['labels'], dtype=torch.long)
        difficult_tensor = torch.as_tensor(transformed_info['difficult'], dtype=torch.long)

        # Convert image to tensor and normalize (since we will use resnet backbone, it expects input in this format)
        im_tensor = torch.from_numpy(im / 255.).permute((2, 0, 1)).float()
        channel_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        channel_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
        im_tensor = (im_tensor - channel_mean) / channel_std

        # Build Target for Yolo
        h, w = im.shape[:2]
        yolo_targets = self._build_yolo_targets(bboxes_tensor, labels_tensor, h, w)

        # For training, we use yolo_targets(xoffset, yoffset, sqrt(w), sqrt(h))
        # For evaluation we use bboxes_tensor (x1, y1, x2, y2)
        # Below we normalize bboxes tensor to be between 0-1
        # as thats what evaluation script expects so (x1/w, y1/h, x2/w, y2/h)
        bboxes_tensor = bboxes_tensor / torch.Tensor([[w, h, w, h]])
        targets = {
            'bboxes': bboxes_tensor,
            'labels': labels_tensor,
            'yolo_targets': yolo_targets,
            'difficult': difficult_tensor,
        }
        return im_tensor, targets, im_info['filename']


In [20]:
def collate_function(data):
    """Regroup a batch of samples field by field.

    :param data: iterable of equally structured samples (tuples)
    :return: list with one tuple per sample field, each holding that field
        for every sample in the batch; an empty batch gives an empty list
    """
    per_field = zip(*data)
    return [field_values for field_values in per_field]

# Training split of the VOC dataset.
voc = VOCDataset(split='train')

# Despite its name, `train_dataset` is the DataLoader: it yields shuffled
# batches of 64 samples, regrouped per field by `collate_function`.
train_dataset = DataLoader(
    dataset=voc,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_function,
)

Total 16551 images found
Example Information : {'img_id': '000005', 'filename': 'VOCdevkit/VOC2007/JPEGImages/000005.jpg', 'width': 500, 'height': 375, 'detections': [{'label': 8, 'bbox': [262, 210, 323, 338], 'difficult': 0}, {'label': 8, 'bbox': [164, 263, 252, 371], 'difficult': 0}, {'label': 8, 'bbox': [240, 193, 294, 298], 'difficult': 0}]}


# **Modelling**

In [21]:
class YOLOV1(nn.Module):
    """YOLOv1-style detector: ResNet-34 feature extractor, four extra
    convolution blocks and a fully connected prediction head.

    The head flattens its input and expects 7 * 7 * 1024 features per image;
    the output has 7 * 7 * (5 * 2 + 20) values per image.
    """

    def __init__(self):
        super().__init__()

        # ImageNet pre-trained ResNet-34, kept without its pooling / fc classifier
        resnet = torchvision.models.resnet34(weights=torchvision.models.ResNet34_Weights.IMAGENET1K_V1)
        backbone_stages = [
            resnet.conv1,
            resnet.bn1,
            resnet.relu,
            resnet.maxpool,
            resnet.layer1,
            resnet.layer2,
            resnet.layer3,
            resnet.layer4,
        ]
        self.features = nn.Sequential(*backbone_stages)

        # (in_channels, out_channels, stride) of the four detection conv blocks;
        # the first one takes 512 channels because that is what resnet outputs
        # and only the second block uses stride 2.
        conv_specs = [
            (512, 1024, 1),
            (1024, 1024, 2),
            (1024, 1024, 1),
            (1024, 1024, 1),
        ]
        detection_layers = []
        for in_channels, out_channels, stride in conv_specs:
            detection_layers.append(
                nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False))
            detection_layers.append(nn.BatchNorm2d(out_channels))
            detection_layers.append(nn.LeakyReLU(0.1))
        self.conv_yolo_layers = nn.Sequential(*detection_layers)

        # Fully connected head. A 1*1 convolution with (5 * B + C) output
        # channels applied to the 1024-channel map would be an alternative.
        flattened_features = 7 * 7 * 1024
        prediction_size = 7 * 7 * (5 * 2 + 20)
        self.fc_yolo_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_features, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, prediction_size),
        )

    def forward(self, x):
        """Run backbone, detection convolutions and head; returns the flat predictions."""
        backbone_out = self.features(x)
        conv_out = self.conv_yolo_layers(backbone_out)
        return self.fc_yolo_layers(conv_out)

In [None]:
# Build the detector and move its parameters to the selected device (GPU if available, else CPU).
yolo_model = YOLOV1().to(device)

YOLOV1(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

# **Training**

In [24]:
# Switch the model to training mode
yolo_model.train()

# Optimizer: SGD with momentum and weight decay over the trainable parameters only
optimizer = torch.optim.SGD(
    (param for param in yolo_model.parameters() if param.requires_grad),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)

# Scheduler: halve the learning rate at each of the listed epochs
scheduler = MultiStepLR(optimizer, milestones=[50, 75, 100, 125], gamma=0.5)

# Loss function (comes from the cloned CV repository, so it can only be
# imported after the clone cell has run)
from CV.object_detection.yolo_version1.loss import YOLOV1Loss
criterion = YOLOV1Loss()

# Remaining training parameters
acc_steps = 1     # number of batches whose gradients are accumulated per optimizer step
num_epochs = 135
steps = 0         # global batch counter, used for periodic loss logging

In [None]:
# Training loop: one pass over the DataLoader per epoch, with optional
# gradient accumulation over `acc_steps` batches.
for epoch_idx in range(num_epochs):
    losses = []
    optimizer.zero_grad()

    for idx, (ims, targets, _) in enumerate(tqdm(train_dataset)):
        # The collate function returns per-sample tuples; stack them into
        # batch tensors on the CPU and move each batch to the device once.
        yolo_targets = torch.stack(
            [target['yolo_targets'] for target in targets]).float().to(device)
        im = torch.stack(ims).float().to(device)

        yolo_preds = yolo_model(im)
        # Scale the loss so accumulated gradients average over acc_steps batches
        loss = criterion(yolo_preds, yolo_targets) / acc_steps

        # Stop BEFORE backward()/step() so a NaN loss cannot corrupt the
        # weights. Raising (instead of exit(0)) ends the cell but keeps the
        # kernel, and therefore the model, alive for inspection.
        if torch.isnan(loss):
            raise RuntimeError('Loss is becoming nan. Exiting')

        loss.backward()
        losses.append(loss.item())

        if (idx + 1) % acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if steps % 100 == 0:
            print('Loss : {:.4f}'.format(np.mean(losses)))

        steps += 1

    print('Finished epoch {}'.format(epoch_idx+1))
    # Apply gradients left over when the number of batches is not a multiple of acc_steps
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step()
    #torch.save(yolo_model.state_dict(), os.path.join(train_config['task_name'], train_config['ckpt_name']))

  0%|          | 1/259 [03:03<13:09:50, 183.68s/it]

Loss : 19.0473


  0%|          | 1/259 [03:58<17:04:33, 238.27s/it]


KeyboardInterrupt: 

# **Inference**

In [None]:
# Dataset settings used at inference / evaluation time
dataset_config = {
    'test_im_sets': ['data/VOC2007-test'],
    'num_classes': 20,
    'im_size': 448,
}

# Training / inference settings
train_config = {
    'task_name': 'voc',
    'acc_steps': 1, # increase if you want to get gradients from >1 steps(kind of mimicking >1 batch size)
    'log_steps': 100,
    'num_epochs': 135,
    'batch_size': 64,
    # The value was missing here (a syntax error); these are the same
    # milestones the training cell passes to MultiStepLR.
    'lr_steps': [50, 75, 100, 125],
    'lr': 0.001,
    'infer_conf_threshold': 0.2,
    'eval_conf_threshold': 0.001,
    'nms_threshold': 0.5,
    'ckpt_name': 'yolo_voc2007.pth',
}

In [None]:
from CV.object_detection.yolo_version1.infer import infer, evaluate_map

In [None]:
# NOTE(review): `args` is not defined in any visible cell of this notebook, so
# this call raises NameError on a fresh kernel. Build the argument object that
# `infer` expects first — TODO confirm its required fields against
# CV/object_detection/yolo_version1/infer.py.
infer(args)

In [None]:
# NOTE(review): `args` is not defined in any visible cell of this notebook, so
# this call raises NameError on a fresh kernel. Build the argument object that
# `evaluate_map` expects first — TODO confirm its required fields against
# CV/object_detection/yolo_version1/infer.py.
evaluate_map(args)