[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/CV/blob/main/object_detection/yolo_version1/runner.ipynb)

In [2]:
# !pip install tqdm==4.66.4
# !pip install torchvision==0.18.1
# !pip install torch==2.3.1
# !pip install albumentations==1.4.13
# !pip install Pillow==10.4.0
# !pip install opencv_python==4.10.0.84
# !pip install einops==0.8.0

In [3]:
import argparse
import os
import random
from tqdm import tqdm
import numpy as np 
import yaml
import csv


import torch
from torch.utils.data.dataloader import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
import torch.nn as nn
import torchvision

In [4]:
# Dataset-level settings read when the model is built (class count, image size).
dataset_config = dict(
    test_im_sets=['data/VOC2007-test'],
    num_classes=20,
    im_size=448,
)

# Training / inference settings. `task_name` doubles as the checkpoint folder
# and `ckpt_name` as the checkpoint file name inside it.
train_config = dict(
    task_name='voc',
    seed=1111,
    # Raise above 1 to accumulate gradients over several steps before each
    # optimizer update (mimics a larger batch size).
    acc_steps=1,
    log_steps=100,
    num_epochs=135,
    batch_size=64,
    lr_steps=[50, 75, 100, 125],
    lr=0.001,
    infer_conf_threshold=0.2,
    eval_conf_threshold=0.001,
    nms_threshold=0.5,
    ckpt_name='yolo_voc2007.pth',
)

In [5]:
# Seed every random number generator the notebook uses so runs are repeatable.
seed = train_config['seed']
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `device` is a torch.device object, not a string: comparing it with 'cuda'
# directly does not match, so the CUDA seeding below was never reached.
# Compare the device *type* instead.
if device.type == 'cuda':
    torch.cuda.manual_seed_all(seed)

In [5]:
!git clone https://github.com/khetansarvesh/CV.git

Cloning into 'CV'...
remote: Enumerating objects: 820, done.[K
remote: Counting objects: 100% (285/285), done.[K
remote: Compressing objects: 100% (124/124), done.[K
remote: Total 820 (delta 144), reused 274 (delta 136), pack-reused 535 (from 1)[K
Receiving objects: 100% (820/820), 28.52 MiB | 46.14 MiB/s, done.
Resolving deltas: 100% (441/441), done.


# **Dataset**

In [1]:
# GETTING VOC2007 TRAIN DATASET and EXTRACTING TAR FILES                                                             
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
!tar xf VOCtrainval_06-Nov-2007.tar

--2024-11-05 17:04:56--  http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460032000 (439M) [application/x-tar]
Saving to: ‘VOCtrainval_06-Nov-2007.tar’


2024-11-05 17:05:09 (32.2 MB/s) - ‘VOCtrainval_06-Nov-2007.tar’ saved [460032000/460032000]



In [None]:
# GETTING VOC2012 TRAIN DATASET and EXTRACTING TAR FILES                                                               
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
!tar xf VOCtrainval_11-May-2012.tar

--2024-11-05 17:05:17--  http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1999639040 (1.9G) [application/x-tar]
Saving to: ‘VOCtrainval_11-May-2012.tar’


2024-11-05 17:06:13 (34.6 MB/s) - ‘VOCtrainval_11-May-2012.tar’ saved [1999639040/1999639040]



In [None]:
# GETTING VOC2007 TEST DATASET and EXTRACTING TAR FILES                                                             
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar # 
!tar xf VOCtest_06-Nov-2007.tar

In [None]:
## Getting the image locations for the 2007 (train and test) and 2012 (train only) datasets as txt files
!wget https://pjreddie.com/media/files/voc_label.py
!python voc_label.py

In [None]:
# For training we will use train dataset from both 2007 and 2012 and hence we concatenate them and store in a new file called train.txt
!cat 2007_train.txt 2007_val.txt 2012_*.txt > train.txt

# For testing we will only use test from 2007 and we store that in test.txt
!cp 2007_test.txt test.txt

In [None]:
# Move the per-split txt files we won't be using out of the working directory.
# `-p` keeps this cell re-runnable when the folder already exists, and the
# explicit `!` matches the other shell cells instead of relying on automagic.
!mkdir -p old_txt_files
!mv 2007* 2012* old_txt_files/


In [None]:
def _write_image_label_csv(list_path, csv_path):
    """Turn a list of image paths into a two-column ``image,label`` CSV.

    Each non-empty line of ``list_path`` is reduced to its final ``/``-separated
    component (the image file name); the label file name is that same name with
    ``.jpg`` replaced by ``.txt``.

    Args:
        list_path: text file with one image path per line.
        csv_path: CSV file to create (overwritten if it already exists).
    """
    # Both files are context-managed so they are closed even if a row fails.
    with open(list_path, "r") as list_file, \
            open(csv_path, mode="w", newline="") as csv_file:
        writer = csv.writer(csv_file)  # one writer for the whole file
        for line in list_file:
            image_file = line.split("/")[-1].replace("\n", "")
            if not image_file:
                # A blank line would otherwise become an empty ',' row.
                continue
            writer.writerow([image_file, image_file.replace(".jpg", ".txt")])


_write_image_label_csv("train.txt", "train.csv")
_write_image_label_csv("test.txt", "test.csv")


In [None]:
# Collect every image and label file into a flat data/ layout.
# `-p` creates missing parents and is a no-op if the folders already exist.
!mkdir -p data/images data/labels

# Move (not copy) so the datasets are not stored twice on disk.
!mv VOCdevkit/VOC2007/JPEGImages/*.jpg data/images/
!mv VOCdevkit/VOC2012/JPEGImages/*.jpg data/images/
!mv VOCdevkit/VOC2007/labels/*.txt data/labels/
!mv VOCdevkit/VOC2012/labels/*.txt data/labels/

# VOCdevkit is no longer needed once its contents have been moved;
# remove it to save some space.
!rm -rf VOCdevkit/
!mv test.txt train.txt old_txt_files/

In [None]:
# Build the dataset object from the loader shipped in the cloned CV repo,
# passing 'train' (presumably selects the training split - confirm in dataset.py).
# NOTE(review): the saved output of this cell is
# "ImportError: libGL.so.1: cannot open shared object file", i.e. this import
# needs that system library at runtime (presumably pulled in through OpenCV -
# confirm, then install libGL or a headless OpenCV build before running).
from CV.object_detection.yolo_version1.dataset import VOCDataset
voc = VOCDataset('train')

ImportError: libGL.so.1: cannot open shared object file: No such file or directory

In [None]:
def collate_function(data):
    """Transpose a batch of per-sample tuples into per-field tuples.

    ``[(a1, b1), (a2, b2)]`` becomes ``[(a1, a2), (b1, b2)]``; an empty batch
    gives an empty list. Nothing is stacked or converted.
    """
    per_field = zip(*data)
    return [field_values for field_values in per_field]

# Despite its name this is the DataLoader over `voc`: shuffled batches whose
# samples are regrouped per field by `collate_function`.
train_dataset = DataLoader(
    dataset=voc,
    batch_size=train_config['batch_size'],
    shuffle=True,
    collate_fn=collate_function,
)

# **Modelling**

In [None]:
class YOLOV1(nn.Module):
    """YOLOv1-style detector: ResNet-34 trunk followed by a detection head.

    Three stages are applied in sequence (see ``forward``):

    * ``features`` - the convolutional part of a torchvision ResNet-34 created
      with the ``IMAGENET1K_V1`` weights (stem, max-pool, ``layer1``..``layer4``).
    * ``conv_yolo_layers`` - four Conv(3x3, no bias) -> BatchNorm -> LeakyReLU
      stages; only the second stage uses stride 2.
    * ``fc_yolo_layers`` - the predictor: a single 1x1 convolution when
      ``use_conv`` is set, otherwise Flatten -> Linear -> LeakyReLU -> Dropout
      -> Linear.

    For each grid cell the predictor emits ``5 * B + C`` values laid out as::

        x_offset_box1, y_offset_box1, sqrt_w_box1, sqrt_h_box1, conf_box1,
        ...,
        x_offset_boxB, y_offset_boxB, sqrt_w_boxB, sqrt_h_boxB, conf_boxB,
        p1, p2, ..., pC      # class conditional probabilities

    Args:
        im_size: stored on the instance as ``im_size``.
        num_classes: number of classes ``C``.
        model_config: mapping with the keys ``im_channels``,
            ``backbone_channels``, ``yolo_conv_channels``,
            ``conv_spatial_size``, ``leaky_relu_slope``, ``fc_dim``,
            ``fc_dropout``, ``use_conv``, ``S`` and ``B``.
    """

    def __init__(self, im_size, num_classes, model_config):
        super().__init__()
        cfg = model_config
        self.im_size = im_size
        self.im_channels = cfg['im_channels']
        self.backbone_channels = cfg['backbone_channels']
        self.yolo_conv_channels = cfg['yolo_conv_channels']
        self.conv_spatial_size = cfg['conv_spatial_size']
        self.leaky_relu_slope = cfg['leaky_relu_slope']
        self.yolo_fc_hidden_dim = cfg['fc_dim']
        self.yolo_fc_dropout_prob = cfg['fc_dropout']
        self.use_conv = cfg['use_conv']
        self.S = cfg['S']
        self.B = cfg['B']
        self.C = num_classes

        # ---------------------------------------------------------------
        # Backbone: ResNet-34 with ImageNet weights, minus its pooling/fc.
        # ---------------------------------------------------------------
        resnet = torchvision.models.resnet34(
            weights=torchvision.models.ResNet34_Weights.IMAGENET1K_V1)
        trunk = [
            resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool,
            resnet.layer1, resnet.layer2, resnet.layer3, resnet.layer4,
        ]
        self.features = nn.Sequential(*trunk)

        # ---------------------------------------------------------------
        # Detection conv layers: four Conv/BatchNorm/LeakyReLU stages.
        # Each entry is (input channels, stride) for that stage's conv.
        # Layers are appended in the same order as a hand-written
        # Sequential, so sub-module indices stay 0..11.
        # ---------------------------------------------------------------
        stage_specs = (
            (self.backbone_channels, 1),
            (self.yolo_conv_channels, 2),
            (self.yolo_conv_channels, 1),
            (self.yolo_conv_channels, 1),
        )
        head_layers = []
        for in_channels, stride in stage_specs:
            head_layers.append(nn.Conv2d(in_channels,
                                         self.yolo_conv_channels,
                                         3,
                                         stride=stride,
                                         padding=1,
                                         bias=False))
            head_layers.append(nn.BatchNorm2d(self.yolo_conv_channels))
            head_layers.append(nn.LeakyReLU(self.leaky_relu_slope))
        self.conv_yolo_layers = nn.Sequential(*head_layers)

        # ---------------------------------------------------------------
        # Predictor: 5B + C values per cell (1x1 conv), or S*S*(5B+C)
        # values in total (fully connected variant).
        # ---------------------------------------------------------------
        values_per_cell = 5 * self.B + self.C
        if self.use_conv:
            predictor = [nn.Conv2d(self.yolo_conv_channels, values_per_cell, 1)]
        else:
            flat_dim = (self.conv_spatial_size * self.conv_spatial_size *
                        self.yolo_conv_channels)
            predictor = [
                nn.Flatten(),
                nn.Linear(flat_dim, self.yolo_fc_hidden_dim),
                nn.LeakyReLU(self.leaky_relu_slope),
                nn.Dropout(self.yolo_fc_dropout_prob),
                nn.Linear(self.yolo_fc_hidden_dim,
                          self.S * self.S * values_per_cell),
            ]
        self.fc_yolo_layers = nn.Sequential(*predictor)

    def forward(self, x):
        """Run trunk, detection convs and predictor on ``x``.

        With ``use_conv`` the predictor output has its channel axis moved last
        via ``permute(0, 2, 3, 1)`` (Batch x S x S x (5B+C) when the spatial
        size equals S); otherwise the predictor output is returned unchanged.
        """
        trunk_out = self.features(x)
        head_out = self.conv_yolo_layers(trunk_out)
        preds = self.fc_yolo_layers(head_out)
        if not self.use_conv:
            return preds
        return preds.permute(0, 2, 3, 1)


In [None]:
# Architecture settings read by YOLOV1.__init__, plus `use_sigmoid`, which is
# forwarded to the loss in the training loop.
model_config = dict(
    im_channels=3,
    backbone_channels=512,
    conv_spatial_size=7,
    yolo_conv_channels=1024,
    leaky_relu_slope=0.1,
    fc_dim=4096,
    fc_dropout=0.5,
    S=7,
    B=2,
    use_sigmoid=True,
    use_conv=True,
)

# Arguments follow the constructor order: im_size, num_classes, model_config.
yolo_model = YOLOV1(
    dataset_config['im_size'],
    dataset_config['num_classes'],
    model_config,
)

# **Training**

In [None]:
# Put the network in training mode and move it to the selected device.
yolo_model.train()
yolo_model.to(device)

# Checkpoints live at <task_name>/<ckpt_name>; build the path once instead of
# re-joining it at every use.
ckpt_dir = train_config['task_name']
ckpt_path = os.path.join(ckpt_dir, train_config['ckpt_name'])

# Resume from an earlier run when a checkpoint is already present.
if os.path.exists(ckpt_path):
    print('Loading checkpoint as one exists')
    # NOTE(review): torch.load unpickles the file - only load checkpoints you
    # trust (consider passing weights_only=True).
    yolo_model.load_state_dict(torch.load(ckpt_path, map_location=device))

# exist_ok=True replaces the check-then-create pair and keeps the cell
# re-runnable.
os.makedirs(ckpt_dir, exist_ok=True)

# SGD over the trainable parameters only.
optimizer = torch.optim.SGD(lr=train_config['lr'],
                            params=filter(lambda p: p.requires_grad,
                                          yolo_model.parameters()),
                            weight_decay=5E-4,
                            momentum=0.9)

In [None]:
# Step schedule: the learning rate is multiplied by gamma (0.5) at each epoch
# listed in train_config['lr_steps'].
scheduler = MultiStepLR(
    optimizer,
    milestones=train_config['lr_steps'],
    gamma=0.5,
)

# Loss implementation shipped with the cloned CV repo.
from CV.object_detection.yolo_version1.loss import YOLOV1Loss
criterion = YOLOV1Loss()

# Loop settings, plus a global batch counter used for periodic logging.
acc_steps, num_epochs = train_config['acc_steps'], train_config['num_epochs']
steps = 0

In [None]:

# Main training loop: one pass over `train_dataset` per epoch with optional
# gradient accumulation over `acc_steps` batches, a scheduler step and a
# checkpoint save at the end of every epoch.
for epoch_idx in range(num_epochs):
    losses = []
    # True while gradients have been accumulated but not yet applied.
    pending_update = False
    optimizer.zero_grad()
    for idx, (ims, targets, _) in enumerate(tqdm(train_dataset)):
        # Samples arrive as tuples (see collate_function); add a leading batch
        # axis to each one and concatenate into batch tensors on the device.
        yolo_targets = torch.cat([
            target['yolo_targets'].unsqueeze(0).float().to(device)
            for target in targets], dim=0)
        im = torch.cat([image.unsqueeze(0).float().to(device) for image in ims], dim=0)
        yolo_preds = yolo_model(im)
        loss = criterion(yolo_preds, yolo_targets, use_sigmoid=model_config['use_sigmoid'])

        # Stop *before* backward()/step() so a NaN loss can never reach the
        # weights or the saved checkpoint. An exception is raised because
        # calling exit(0) inside a notebook only requests kernel shutdown and
        # does not abort the running cell.
        if torch.isnan(loss):
            raise RuntimeError('Loss is becoming nan. Exiting')

        # Log the unscaled loss so the printed value does not depend on
        # acc_steps; only the value sent to backward() is scaled.
        losses.append(loss.item())
        (loss / acc_steps).backward()
        pending_update = True

        if (idx + 1) % acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            pending_update = False
        if steps % train_config['log_steps'] == 0:
            print('Loss : {:.4f}'.format(np.mean(losses)))
        steps += 1
    print('Finished epoch {}'.format(epoch_idx+1))
    # Apply leftover gradients only when the epoch ended in the middle of an
    # accumulation window; otherwise there is nothing to step on.
    if pending_update:
        optimizer.step()
        optimizer.zero_grad()
    scheduler.step()
    torch.save(yolo_model.state_dict(), os.path.join(train_config['task_name'],
                                                     train_config['ckpt_name']))
print('Done Training...')

# **Inference**

In [None]:
from CV.object_detection.yolo_version1.infer import infer, evaluate_map

In [None]:
# NOTE(review): `args` is not defined in any cell visible in this notebook, so
# this call raises NameError as written. Build the argument object `infer`
# expects (see CV/object_detection/yolo_version1/infer.py) before running.
infer(args)

In [None]:
# NOTE(review): `args` is not defined in any cell visible in this notebook, so
# this call raises NameError as written. Build the argument object
# `evaluate_map` expects (see CV/object_detection/yolo_version1/infer.py)
# before running.
evaluate_map(args)