[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/CV/blob/main/object_detection/yolo_v1/runner.ipynb)

In [None]:
!git clone https://github.com/khetansarvesh/CV.git

In [None]:
import os
import csv
from tqdm import tqdm

# pytorch library
import torch
import torchvision.transforms as transforms
import torchvision.transforms.functional as FT
from torch.utils.data import DataLoader
import torch.nn as nn
import torchvision

In [None]:
# Use the GPU when CUDA is actually usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select "cuda" even on a CPU-only runtime.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

# **Dataset**

In [None]:
# GETTING VOC2007 TRAIN DATASET and EXTRACTING TAR FILES
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
!tar xf VOCtrainval_06-Nov-2007.tar

# GETTING VOC2012 TRAIN DATASET and EXTRACTING TAR FILES
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
!tar xf VOCtrainval_11-May-2012.tar

# GETTING VOC2007 TEST DATASET and EXTRACTING TAR FILES
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar #
!tar xf VOCtest_06-Nov-2007.tar

# Getting the image locations for 2007 (both train and test) and 2012 (only train) in txt files
!wget https://pjreddie.com/media/files/voc_label.py
!python voc_label.py

In [None]:
# creating a new folder data which will contain all the images and labels
!mkdir data
!mkdir data/images
!mkdir data/labels

!mv VOCdevkit/VOC2007/JPEGImages/*.jpg data/images/
!mv VOCdevkit/VOC2012/JPEGImages/*.jpg data/images/
!mv VOCdevkit/VOC2007/labels/*.txt data/labels/
!mv VOCdevkit/VOC2012/labels/*.txt data/labels/

Creating Training Dataset

In [None]:
# training data will consist of 2007_train + 2007_val + 2012_train + 2012_val, concatenating all of these and storing in train.txt
!cat 2007_train.txt 2007_val.txt 2012_*.txt > train.txt

# Convert train.txt (one image path per line) into train.csv, where every row is
# "<image file name>,<label file name>".
# The input file is read inside a `with` block so its handle is closed right away.
with open("train.txt", "r") as list_file:
    read_train = list_file.readlines()

with open("train.csv", mode="w", newline="") as train_file:
    writer = csv.writer(train_file)  # one writer for the whole file, not one per row
    for line in read_train:
        # keep only the file name: drop the directory part and the trailing newline
        image_file = line.strip().split("/")[-1]
        if not image_file:
            continue  # a blank line would otherwise produce a row of empty names
        # the label file shares the image's base name, with a .txt extension
        text_file = os.path.splitext(image_file)[0] + ".txt"
        writer.writerow([image_file, text_file])

In [None]:
from CV.object_detection.yolo_v1.dataset import VOCDataset

# Training dataset built from train.csv
# (presumably image/label file pairs resolved against the two folders; see dataset.py).
train_dataset = VOCDataset("train.csv", img_dir="data/images", label_dir="data/labels")

# Training dataloader: batches of 16, reshuffled every epoch, incomplete last batch dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    drop_last=True,
    num_workers=2,
    pin_memory=True,
)

Creating Testing Dataset

In [None]:
# testing data will consist of 2007_test
!cp 2007_test.txt test.txt

# Convert test.txt (one image path per line) into test.csv, where every row is
# "<image file name>,<label file name>".
# The input file is read inside a `with` block so its handle is closed right away.
with open("test.txt", "r") as list_file:
    read_test = list_file.readlines()

with open("test.csv", mode="w", newline="") as test_file:
    writer = csv.writer(test_file)  # one writer for the whole file, not one per row
    for line in read_test:
        # keep only the file name: drop the directory part and the trailing newline
        image_file = line.strip().split("/")[-1]
        if not image_file:
            continue  # a blank line would otherwise produce a row of empty names
        # the label file shares the image's base name, with a .txt extension
        text_file = os.path.splitext(image_file)[0] + ".txt"
        writer.writerow([image_file, text_file])

In [None]:
# testing dataset
test_dataset = VOCDataset("test.csv", img_dir="data/images", label_dir="data/labels")

# Evaluation dataloader.
# shuffle=False keeps the sample order stable between runs, and drop_last=False
# makes sure the final, smaller batch is evaluated too (dropping it would
# silently exclude up to batch_size - 1 test images from every metric).
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    num_workers=2,
    pin_memory=True,
    shuffle=False,
    drop_last=False)

Cleaning up to free some disk space

In [None]:
! rm 2007* 2012* *.txt *.tar *.py
! rm -rf VOCdevkit/

# **Modelling**

In [None]:
class YOLOV1(nn.Module):
    """YOLOv1-style detector on top of an ImageNet-pretrained ResNet-34.

    The ResNet stem and its four residual stages are reused as the feature
    extractor (the ResNet average-pool and classifier are left out). Four
    3x3 conv blocks and a two-layer fully connected head follow.

    The first linear layer expects a 7 x 7 x 1024 feature map; the output is a
    flat vector of 7 * 7 * (5 * 2 + 20) values per image.
    NOTE(review): a 7 x 7 map after the stride-2 head conv presumably
    corresponds to 448 x 448 inputs - confirm against the dataset transform.
    """

    @staticmethod
    def _conv_block(in_channels, out_channels, stride=1):
        """Return [3x3 conv without bias, batch norm, LeakyReLU(0.1)] as a list."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1),
        ]

    def __init__(self):
        super().__init__()

        # Pretrained backbone: keep everything up to and including layer4.
        resnet = torchvision.models.resnet34(weights=torchvision.models.ResNet34_Weights.IMAGENET1K_V1)
        backbone_stages = [
            resnet.conv1,
            resnet.bn1,
            resnet.relu,
            resnet.maxpool,
            resnet.layer1,
            resnet.layer2,
            resnet.layer3,
            resnet.layer4,
        ]
        self.features = nn.Sequential(*backbone_stages)

        # Detection conv head as (in_channels, out_channels, stride).
        # It starts at 512 channels because that is what ResNet-34's layer4 emits;
        # the second block halves the spatial resolution.
        head_spec = [
            (512, 1024, 1),
            (1024, 1024, 2),
            (1024, 1024, 1),
            (1024, 1024, 1),
        ]
        head_layers = []
        for in_channels, out_channels, stride in head_spec:
            head_layers.extend(self._conv_block(in_channels, out_channels, stride))
        self.conv_yolo_layers = nn.Sequential(*head_layers)

        # Fully connected head: flatten -> 4096 hidden units -> 7 * 7 * 30 outputs.
        # An alternative head would be a single 1x1 convolution producing
        # 5 * B + C channels per cell instead of the two linear layers.
        hidden_units = 4096
        self.fc_yolo_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(7 * 7 * 1024, hidden_units),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(hidden_units, 7 * 7 * (5 * 2 + 20)),
        )

    def forward(self, x):
        """Run backbone, conv head and FC head; return the flat prediction tensor."""
        backbone_features = self.features(x)
        head_features = self.conv_yolo_layers(backbone_features)
        return self.fc_yolo_layers(head_features)

In [None]:
class YOLOV1(nn.Module):
    """YOLOv1 detector with a Darknet-style convolutional backbone trained from scratch.

    Note: this class reuses the name ``YOLOV1``. When the notebook cells are run
    in file order it replaces the ResNet-34 based definition from the cell above.

    Args:
        split_size: grid size S; the image is divided into S x S cells.
        num_boxes: number of boxes B predicted per cell (5 values each).
        num_classes: number of object classes C.
        dropout: dropout probability applied after the first linear layer
            (0.0 disables it, matching the previous hard-coded value).

    The backbone reduces spatial resolution by a factor of 64 (one stride-2
    conv, four 2x2 max-pools, one more stride-2 conv), so inputs must be
    ``64 * split_size`` pixels on each side (448 for the default S = 7).

    ``forward`` returns a tensor of shape ``(batch, S * S * (C + B * 5))``.
    """

    # Backbone layout, in order. A tuple is one conv block given as
    # (kernel_size, out_channels, stride); "M" is a 2x2 max-pool with stride 2.
    # Padding is kernel_size // 2, i.e. 3 / 1 / 0 for kernels 7 / 3 / 1.
    _ARCHITECTURE = (
        (7, 64, 2), "M",
        (3, 192, 1), "M",
        (1, 128, 1), (3, 256, 1), (1, 256, 1), (3, 512, 1), "M",
        (1, 256, 1), (3, 512, 1),
        (1, 256, 1), (3, 512, 1),
        (1, 256, 1), (3, 512, 1),
        (1, 256, 1), (3, 512, 1),
        (1, 512, 1), (3, 1024, 1), "M",
        (1, 512, 1), (3, 1024, 1),
        (1, 512, 1), (3, 1024, 1),
        (3, 1024, 1), (3, 1024, 2), (3, 1024, 1), (3, 1024, 1),
    )

    def __init__(self, split_size=7, num_boxes=2, num_classes=20, dropout=0.0):
        super(YOLOV1, self).__init__()

        self.split_size = split_size
        self.num_boxes = num_boxes
        self.num_classes = num_classes

        S, B, C = split_size, num_boxes, num_classes

        self.darknet = self._build_darknet(in_channels=3)

        # nn.Flatten() is the first layer here, so forward() does not need to
        # flatten the feature map itself.
        self.fcs = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 4096),
            nn.Dropout(dropout),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, S * S * (C + B * 5)),
        )

    @classmethod
    def _build_darknet(cls, in_channels=3):
        """Build the flat conv / batch-norm / LeakyReLU / max-pool stack from _ARCHITECTURE."""
        layers = []
        for spec in cls._ARCHITECTURE:
            if spec == "M":
                layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))
                continue
            kernel_size, out_channels, stride = spec
            layers.append(
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=kernel_size // 2,
                    bias=False,  # the following batch norm supplies the shift
                )
            )
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.LeakyReLU(0.1))
            in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return flat predictions of shape (batch, S * S * (C + B * 5))."""
        x = self.darknet(x)
        return self.fcs(x)

In [None]:
# Instantiate the detector and move its parameters to DEVICE.
# Two cells above define a class named YOLOV1; the one executed most recently is used here.
model = YOLOV1().to(DEVICE)

# **Training**

In [None]:
# import dependencies
from CV.object_detection.yolo_v1.loss import YoloLoss
from CV.object_detection.yolo_v1.mean_avg_precision import mean_average_precision
from CV.object_detection.yolo_v1.nms import non_max_suppression
from CV.object_detection.yolo_v1.utils import (plot_image, convert_cellboxes, cellboxes_to_boxes)

# Setting Seed
# NOTE(review): `model` is created in an earlier cell, so when the cells run top to bottom
# this seed does not affect the initial weights - only randomness consumed afterwards.
seed = 123
torch.manual_seed(seed)

# Setting Optimizer
# Adam over every model parameter with learning rate 2e-5 and weight decay disabled.
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=0)

In [None]:
# def get_bboxes(loader, model, iou_threshold, threshold, pred_format="cells", box_format="midpoint", device="cuda",):
#     all_pred_boxes = []
#     all_true_boxes = []

#     # make sure model is in eval before get bboxes
#     model.eval()
#     train_idx = 0

#     for batch_idx, (x, labels) in enumerate(loader):
#         x = x.to(device)
#         labels = labels.to(device)

#         with torch.no_grad():
#             predictions = model(x)

#         batch_size = x.shape[0]
#         true_bboxes = cellboxes_to_boxes(labels)
#         bboxes = cellboxes_to_boxes(predictions)

#         for idx in range(batch_size):
#             nms_boxes = non_max_suppression(
#                 bboxes[idx],
#                 iou_threshold=iou_threshold,
#                 threshold=threshold,
#                 box_format=box_format,
#             )


#             #if batch_idx == 0 and idx == 0:
#             #    plot_image(x[idx].permute(1,2,0).to("cpu"), nms_boxes)
#             #    print(nms_boxes)

#             for nms_box in nms_boxes:
#                 all_pred_boxes.append([train_idx] + nms_box)

#             for box in true_bboxes[idx]:
#                 # many will get converted to 0 pred
#                 if box[1] > threshold:
#                     all_true_boxes.append([train_idx] + box)

#             train_idx += 1

#     model.train()
#     return all_pred_boxes, all_true_boxes

In [None]:
# Train for 1000 epochs, printing the mean batch loss after each one.
for epoch in range(1000):

    # Optional per-epoch train mAP (requires the commented-out get_bboxes helper above):
    #pred_boxes, target_boxes = get_bboxes(train_loader, model, iou_threshold=0.5, threshold=0.4)
    #mean_avg_prec = mean_average_precision(pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint")
    #print(f"Train mAP: {mean_avg_prec}")

    batch_losses = []
    progress = tqdm(train_loader, leave=True)

    for x, y in progress:
        # move the batch to the same device as the model
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        predictions = model(x)

        # NOTE(review): 7, 2, 20 are presumably the grid size, boxes per cell and
        # class count expected by YoloLoss - confirm against loss.py.
        loss = YoloLoss(7, 2, 20, predictions, y)
        loss_value = loss.item()
        batch_losses.append(loss_value)

        # standard step: clear old gradients, backpropagate, update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # show the current batch loss on the progress bar
        progress.set_postfix(loss=loss_value)

    print(f"Mean loss was {sum(batch_losses)/len(batch_losses)}")

In [None]:

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Serialize ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: the object to store. ``load_checkpoint`` in this file reads the
            keys "state_dict" and "optimizer" from it.
        filename: destination path of the checkpoint file.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer=None):
    """Restore model (and optionally optimizer) state from a loaded checkpoint.

    Args:
        checkpoint: mapping with a "state_dict" entry for the model and, when an
            optimizer is given, an "optimizer" entry for its state.
        model: module whose parameters are overwritten in place.
        optimizer: optimizer to restore. Pass ``None`` (the default) to load
            only the model weights, e.g. for inference.

    Raises:
        KeyError: if a required entry is missing from ``checkpoint``.
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint["optimizer"])


# Inference