# **URBE *Perception*** 🚘 - *real-time vehicle detection for self-driving cars in Rome*

> *Refer to the notebook* [![📔](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sCqnwYm9Dodk1YodD1asVpRMBBdT-8r1#scrollTo=4teaWmm61Fbl) *on **dataset** creation if you haven't already.*

## Introduction

In our case we deal with 2D object detection: we'll detect mostly *vehicles*, *pedestrians* and *motorbikes*. Object detection is one of the most important tasks in computer vision and, since it now reaches very high accuracy scores, we'll focus on the efficiency/latency part: how long is the inference time of the model? Is it capable of succeeding in real-time tasks?

## Imports & Downloads

In [None]:
from src.hyperparameters import Hparams
# from sbert.baseline import SentenceBERT
# from sbert.regression_model import execute_booknlp_pipeline
# from sbert.regression_model import count_event_sentence
# from sbert.regression_model import LengthRegressionModel
from src.data_module import URBE_Dataset, URBE_DataModule
from src.model import URBE_Perception
# from src.train import train_model

import dataclasses
from dataclasses import asdict
import matplotlib.pyplot as plt
import wandb
import pprint
import json
import torchvision
import torchvision.transforms as T
import pytorch_lightning as pl
import gc
from collections import Counter
import seaborn as sns
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
from math import comb
import random
import json
from datasets import load_metric
# reproducibility: seed every random number generator used in this notebook with 0
import numpy as np
import random
import torch
np.random.seed(0)
random.seed(0)
torch.cuda.manual_seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True  # Note that this deterministic mode can have a performance impact
torch.backends.cudnn.benchmark = False
_ = pl.seed_everything(0)
# For a better notebook workflow, see https://stackoverflow.com/questions/5364050/reloading-submodules-in-ipython
# These magics reload the imported .py modules when they change, instead of re-importing everything every time.
%load_ext autoreload
%autoreload 2
#%env WANDB_NOTEBOOK_NAME = ./notebook.ipynb
# run a garbage-collection pass
gc.collect()

In [None]:
# Log in to Weights & Biases so the online logger can be used. It is really useful since it stores all the plots and the evolution of the model.
# See also https://docs.wandb.ai/guides/integrations/lightning
wandb.login()

## Utilities

## Dataset

In [None]:
# Fast check that all the data were correctly imported. Three counts are
# printed so they can be compared by eye: image files on disk, distinct
# annotated image ids, and image records in the COCO annotation file.
print("We should get the same number in both three cases...")
l1 = []
for split in ("train", "test", "val"):
    l1 += os.listdir("dataset/URBE_dataset/images/" + split)
print(len(l1))

# A context manager closes the annotation file as soon as it has been parsed.
with open("dataset/URBE_dataset/labels/COCO/annotations.json") as annotations_file:
    d = json.load(annotations_file)
l2 = list({ann["image_id"] for ann in d["annotations"]})
print(len(l2))

l3 = d["images"]
print(len(l3))

In [None]:
# Build the data module from the default hyperparameters.
hparams = asdict(Hparams())
URBE_Data = URBE_DataModule(hparams)
# to setup it takes ~6 minutes
URBE_Data.setup()

# Print the size of each split followed by the grand total.
# Sizes noted by the author: train --> 3500, val --> 438, test --> 438 images.
total_images = 0
for split_attr in ("data_train", "data_val", "data_test"):
    split_size = len(getattr(URBE_Data, split_attr))
    print(split_size)
    total_images += split_size
print(f"TOTAL: {total_images} images")

### Bounding Boxes Visualization

Bounding-box visualization is of course needed to show the results at the end of the project and to inspect the predictions on the validation set during training, but it was also essential in the *data processing* phase to assess the quality of the datasets' bounding-box annotations and, in general, to recognize the different characteristics of the data. <br> *(I tried **Scalabel**, **FiftyOne**, but **WandB** is the best choice)* 

> Let's test the *dataloaders* and see some samples from a training batch!

In [None]:
def draw_bbox(label):
    """Convert the annotations of one image into a WandB bounding-box dict.

    Args:
        label: sequence of annotations, each indexed as
            ``[x, y, width, height, class_id]``. An empty list is treated as
            batch padding and stops the scan.

    Returns:
        ``{"predictions": {"box_data": [...], "class_labels": {...}}}`` with
        one pixel-domain box per annotation found before the first padding
        entry.
    """
    class_names = {0: "vehicle", 1: "person", 2: "motorbike"}
    boxes = []
    for ann in label:
        # empty lists were appended so that every sample has the same number of labels
        if ann == []:
            break
        left, top = ann[0], ann[1]
        corners = {
            "minX": left,
            "maxX": left + ann[2],
            "minY": top,
            "maxY": top + ann[3],
        }
        class_id = ann[4]
        boxes.append({
            "position": corners,
            "domain": "pixel",
            "class_id": class_id,
            "box_caption": class_names[class_id],
        })
    return {"predictions": {"box_data": boxes, "class_labels": class_names}}

In [None]:
# Pull a single batch out of the training dataloader.
batch = next(iter(URBE_Data.train_dataloader()))

# WandB run that receives the preview table.
user_name = "lavallone"
project_name = "VISIOPE_project"
version_name = "prova"
run = wandb.init(entity=user_name, project=project_name, name=version_name, mode="online")

# Convert every image of the batch to a PIL image.
transform = T.ToPILImage()
images_list = [transform(img) for img in batch["img"]]

# One table row per sample: its id plus the image with its boxes overlaid.
# Each label is a list of lists and is turned into a WandB box dict by draw_bbox.
my_data = [
    [batch["id"][i], wandb.Image(images_list[i], boxes=boxes)]
    for i, boxes in enumerate(map(draw_bbox, batch["labels"]))
]
table = wandb.Table(columns=['ID', 'Image'], data=my_data)
print("logging the table...")
wandb.log({"dataloaders testing": table})

### Statistics 📊

Before starting with the real development of the detection system, we want to plot the statistics of our data. 
> Since iterating the dataloaders over the whole dataset is costly and painful, we will use the "*annotations.json*" file as the source for the dataset statistics.



In [None]:
# Plotting helper: grouped bars, one bar per split (train/val/test) for every category.
def three_group_bar(columns, data, title, percentage=True):
    """Draw a grouped bar chart comparing the train, val and test splits.

    Args:
        columns: category names, used as the x-axis tick labels.
        data: list whose first three items are the train, val and test value
            lists (one value per entry of ``columns``).
        title: title of the chart.
        percentage: when true, each bar is annotated with its value formatted
            to two decimals followed by "%"; otherwise with the raw value.
    """
    train, val, test = data[0], data[1], data[2]

    # one random RGBA colour (alpha fixed to 1) per entry of `data`
    palette = [
        [random.randrange(0, 255)/255 for _ in range(3)] + [1]
        for _ in range(len(data))
    ]

    positions = np.arange(len(columns))
    bar_width = 0.15  # the width of the bars
    fig, ax = plt.subplots(figsize=(12, 5), layout='constrained')

    # (x positions, values, legend label) of each split, left to right
    groups = (
        (positions - bar_width, train, 'Train'),
        (positions, val, 'Val'),
        (positions + bar_width, test, 'Test'),
    )
    bars = [
        ax.bar(xs, values, bar_width, label=name, color=palette[idx])
        for idx, (xs, values, name) in enumerate(groups)
    ]

    # title, custom x-axis tick labels and legend
    ax.set_title(title)
    ax.set_xticks(positions, columns)
    ax.legend()

    # annotate every bar with its value
    for rects, (_, values, _) in zip(bars, groups):
        if percentage:
            captions = [('%.2f' % value) + "%" for value in values]
        else:
            captions = values
        ax.bar_label(rects, captions, padding=3)

In [None]:
# setup: parse the COCO annotation file once and collect the image ids of each split
# (the context manager closes the file as soon as it has been read)
with open("dataset/URBE_dataset/labels/COCO/annotations.json") as annotations_file:
    d = json.load(annotations_file)
annotations = d["annotations"]
images = d["images"]

# The id of an image is the part of its file name after the last "_",
# without the last four characters (presumably the extension, e.g. ".jpg").
train_image_id_list = [f.split("_")[-1][:-4] for f in os.listdir("dataset/URBE_dataset/images/train/")]
val_image_id_list = [f.split("_")[-1][:-4] for f in os.listdir("dataset/URBE_dataset/images/val/")]
test_image_id_list = [f.split("_")[-1][:-4] for f in os.listdir("dataset/URBE_dataset/images/test/")]

**Number of classes**

In [None]:
# Percentage of annotations belonging to each class id (0, 1, 2), computed
# separately for the train, val and test splits (in this order).
data = []

for split_ids in (train_image_id_list, val_image_id_list, test_image_id_list):
    # a set makes each membership test O(1) instead of scanning the whole id list
    split_id_set = set(split_ids)
    classes_list = [ann["category_id"] for ann in tqdm(annotations) if ann["image_id"] in split_id_set]
    c = Counter(classes_list)
    tot = c[0] + c[1] + c[2]
    data.append([(c[class_id]/tot)*100 for class_id in (0, 1, 2)])

In [None]:
# The commented-out literal is presumably the result of an earlier run, kept so the chart can be redrawn without recomputing `data` — TODO confirm.
#data = [[82.46814899865309, 17.16686171034909, 0.364989290997814], [82.85404948638728, 16.814104764671193, 0.33184574894152646], [82.80439305749428, 16.843565364727365, 0.35204157777836304]]
# one tick label per class, in the same order as the values inside each split list
columns = ["vehicle", "person", "motorbike"]
three_group_bar(columns, data, "train/val/test Classes Distribution")

**Time of the day**

In [None]:
# Percentage of images taken during the day, at night and at dawn/dusk,
# computed separately for the train, val and test splits (in this order).
# Two spellings of every value are counted ("daytime"/"Day", "night"/"Night",
# "dawn/dusk"/"Dawn/Dusk"); any other value is ignored.
data = []

for split_ids in (train_image_id_list, val_image_id_list, test_image_id_list):
    # a set makes each membership test O(1) instead of scanning the whole id list
    split_id_set = set(split_ids)
    time_list = [img["timeofday"] for img in tqdm(images) if img["id"] in split_id_set]
    c = Counter(time_list)
    day = c["daytime"] + c["Day"]
    night = c["night"] + c["Night"]
    dawn_dusk = c["dawn/dusk"] + c["Dawn/Dusk"]
    tot = day + night + dawn_dusk
    data.append([(day/tot)*100, (night/tot)*100, (dawn_dusk/tot)*100])

In [None]:
# The commented-out literal is presumably the result of an earlier run, kept so the chart can be redrawn without recomputing `data` — TODO confirm.
#data = [[57.89522657485811, 35.27379733879222, 6.830976086349678], [58.056361763879785, 34.95927347626627, 6.984364759853946], [58.73741141365162, 34.61395001864976, 6.64863856769862]]
# one tick label per time-of-day group, in the same order as the values inside each split list
columns = ["day", "night", "dawn/dusk"]
three_group_bar(columns, data, "train/val/test TimeOfDay Distribution")

## Model

We organized the dataset in order to be compatible with the COCO dataset. We did it initially because all the *YOLO* architectures were trained/tested on it.

We'll now focus more on the **YOLOv5** model, considered one of the best ones at the moment in terms of the *accuracy*/*inference time* trade-off and with very detailed, *PyTorch*-based documentation.

Our goal is to achieve the best performances on our custom "*URBE_dataset*". In order to realize this we need to perform the following steps:
- build a custom YOLOv5 architecture (based on the official repo), to be able to use the *pretrained weights* on the COCO dataset for the **backbone** and the **neck** part. 
- thanks to the **autoanchor** algorithm implemented by Glenn Jocher (one of the authors of YOLOv5), we compute the best new anchors that fit our dataset. This contributes significantly to enhancing the overall model.
- adding the **Mosaic Augmentation** to other basic augmentation techniques to improve the model generalization capability.
- trying to attach the **Decoupled Head** at the end of model (as it was added in YOLOv6 and subsequent architectures) and see if there's an improvement.
- playing around with different versions of the **IoU loss** (GIoU,DIoU or CIoU).

### Autoanchor

Anchors in YOLO models are predefined bounding boxes used to represent the shape and size of the objects in an image. These anchors are used as a reference to compare the predicted bounding boxes from the model with the actual bounding boxes around the objects. 

Glenn Jocher introduced the idea of learning anchor boxes based on the distribution of bounding boxes in the custom dataset with *K-means* and *genetic* learning algorithms. This is very important for custom tasks, because the distribution of bounding box sizes and locations may be dramatically different than the preset bounding box anchors in the COCO dataset. 

> *The autoanchor algorithm runs automatically before training starts (the training code is made publicly available by the YOLOv5 authors).* 

We are indeed going to make the annotations (that are in COCO format) compatible with the "*YOLOv5 text format*". Then we are going to "train" a YOLOv5 architecture on our dataset (but actually we'll only leverage the functionality of autoanchor method). 

In [23]:
import json
import os
from collections import defaultdict
from tqdm import tqdm

# Divisors used to normalise the box coordinates (presumably the width and
# height in pixels of the dataset images — TODO confirm).
IMG_WIDTH = 1280
IMG_HEIGHT = 720
# NOTE(review): values are rounded to 2 decimals, i.e. a resolution of 0.01 of
# the image size (12.8 px horizontally with the divisor above). Kept as-is to
# leave the generated files unchanged; consider a higher precision.
N_DECIMALS = 2

# the context manager closes the annotation file as soon as it has been parsed
with open("dataset/URBE_dataset/labels/COCO/annotations.json", "r") as annotations_file:
    annotations = json.load(annotations_file)["annotations"]

# For every image in dataset/YOLOv5_format/images/1 save its image id and the
# name of the .txt file where its labels will be written.
image_id_list = []
txt_labels_list = []
for image_name in os.listdir("dataset/YOLOv5_format/images/1"):
    image_id_list.append(image_name.split("_")[-1][:-4])
    txt_labels_list.append(image_name[:-4] + ".txt")

# keep only the annotations of the images listed above (set -> O(1) membership test)
wanted_ids = set(image_id_list)
filter_annotations = [ann for ann in annotations if ann["image_id"] in wanted_ids]

# Group the annotations by image once, instead of re-scanning them for every image.
annotations_by_image = defaultdict(list)
for ann in filter_annotations:
    annotations_by_image[ann["image_id"]].append(ann)

# Write one label file per image; each line is
# "<category_id> <x_center> <y_center> <width> <height>" with normalised values.
for image_id, txt_labels_name in zip(image_id_list, txt_labels_list):
    txt_file_name = "dataset/YOLOv5_format/labels/" + txt_labels_name

    line_to_write = []
    for ann in annotations_by_image.get(image_id, []):
        # the bbox is read as (x, y, width, height), with (x, y) treated as a corner
        x1 = float(ann["bbox"][0])
        y1 = float(ann["bbox"][1])
        w = float(ann["bbox"][2])
        h = float(ann["bbox"][3])
        # box centre and size, normalised by the divisors above
        c1 = round((x1 + w/2) / IMG_WIDTH, N_DECIMALS)
        c2 = round((y1 + h/2) / IMG_HEIGHT, N_DECIMALS)
        w_norm = round(w/IMG_WIDTH, N_DECIMALS)
        h_norm = round(h/IMG_HEIGHT, N_DECIMALS)
        line_to_write.append(" ".join(str(v) for v in (ann["category_id"], c1, c2, w_norm, h_norm)))

    # an image without annotations still gets an (empty) label file
    with open(txt_file_name, 'w') as labels_file:
        labels_file.write("\n".join(line_to_write))

> After downloading the YOLOv5 format dataset on *Roboflow*, we "train" it using the YOLOv5 official code.

### Training

In [None]:
# WandB run configuration, currently disabled.
#user_name = "lavallone"
#project_name = "VISIOPE_project"
#version_name = "prova"
#run = wandb.init(entity=user_name, project=project_name, name = version_name, mode = "online")

# Build the data module and the model from the default hyperparameters.
# NOTE: this rebinds the notebook-level name `data`, which the statistics cells use for the plot values.
hparams = asdict(Hparams())
data = URBE_DataModule(hparams)
model = URBE_Perception(hparams)
# Training call, currently disabled (`train_model` is also commented out in the imports).
# NOTE(review): it monitors "val_ROUGE" — confirm this is the intended metric for this detection model.
#trainer = train_model(data, model, experiment_name = version_name, \
#    patience=5, metric_to_monitor="val_ROUGE", mode="max", epochs = 10)

#wandb.finish()

In [None]:
# Smoke test: run one training batch through the model without tracking
# gradients and print the shape of the first element of the output.
# Fall back to the CPU when no CUDA device is available instead of crashing.
device = "cuda" if torch.cuda.is_available() else "cpu"
data.setup()
model.to(device)
for batch in data.train_dataloader():
    with torch.no_grad():
        batch["img"] = batch["img"].to(device)
        #print(batch["img"].shape)
        x = model(batch["img"])
        print(x[0].shape)
    break  # a single batch is enough for the check

In [None]:
# print the textual representation of the model
print(model)

In [None]:
# Scratch cell: try strided slicing on a small tensor of shape (2, 3, 2, 2).
s = [[2, 3], [3, 4], [5, 4]]
patch = [[2, 3], [3, 4]]
# the same 2x2 patch repeated 3 times along dim 1 and 2 times along dim 0
x = torch.tensor([[patch] * 3] * 2)
x.shape

# keep every second element along the last two dimensions
x[..., ::2, ::2]
#(b,c,w,h) -> y(b,4c,w/2,h/2)

# Stuff

In [None]:
import cv2

# Scratch cell: load a screenshot, convert it from BGR to RGB and resize it.
image_path = "Screenshot from 2022-12-19 18-32-25.png"
img = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read;
# fail here with a clear message rather than with an obscure error in cvtColor.
if img is None:
    raise FileNotFoundError(f"could not read image: {image_path}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
print(img)
# target size is (width, height); with a factor of 1 the size is unchanged
resized_img = cv2.resize(
            img,
            (int(img.shape[1] * 1), int(img.shape[0] * 1)),
            interpolation=cv2.INTER_LINEAR,
        ).astype(np.uint8)
print(resized_img.shape)

In [None]:
import torch

# Model: load the 'yolov5s' entry point of the 'ultralytics/yolov5' hub repository with pretrained=True.
# NOTE: this rebinds the notebook-level name `model` (previously the URBE_Perception instance).
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# disabled experiment: keep only the first 10 entries of the nested `model` attribute
#model = model.model.model.model[:10]
# last expression of the cell, so the notebook displays the model
model