# DETR: End-to-End Object Detection with Transformers

- Paper: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872)
- Official video: [DETR - End to end object detection with transformers (ECCV2020)](https://youtu.be/utxbUlo9CyY?si=cIHkqMDQj7rMrTCk)

## Import libraries

In [None]:
import torch
import torchvision
from torch import nn
from torchvision import datasets, transforms
from torchvision.models import resnet50
from torchinfo import summary
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import os
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import numpy as np
from pathlib import Path
import itertools

# Report the installed versions of the core deep-learning libraries.
for _lib in (torch, torchvision):
    print(f"{_lib.__name__} version: {_lib.__version__}")

In [None]:
# Custom imports
import dataloader
import config
import utils
import engine
import model

## Select device

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

## Parameters configuration

In [None]:
# Mirror the project-level settings from `config` into notebook globals.

# Filesystem locations
DATASET_DIR, PARENT_DIR = config.DATASET_DIR, config.PARENT_DIR

# Input image size
IMAGE_HEIGHT, IMAGE_WIDTH = config.IMAGE_HEIGHT, config.IMAGE_WIDTH

# Data-loading and optimisation hyper-parameters
PIN_MEMORY, NUM_WORKERS = config.PIN_MEMORY, config.NUM_WORKERS
LEARNING_RATE, BATCH_SIZE, NUM_EPOCHS = (
    config.LEARNING_RATE,
    config.BATCH_SIZE,
    config.NUM_EPOCHS,
)

# Reproducibility and reporting switches
SEED, PRINT_MODEL_SUMMARY = config.SEED, config.PRINT_MODEL_SUMMARY

In [None]:
# Resolve every dataset location relative to the dataset root directory.
data_path = Path(DATASET_DIR)

# Sub-directories holding the images and their label files
image_path = data_path.joinpath("images")
label_path = data_path.joinpath("labels")

# Index files: class-name listing plus the train/test split CSVs
class_names_file = data_path.joinpath("class_names.txt")
train_csv_file = data_path.joinpath("train.csv")
test_csv_file = data_path.joinpath("test.csv")

## Load dataset metadata

In [None]:
# Read the class-names file: one "<index>: '<name>'" entry per line.
# Whitespace-only lines are skipped so a stray blank line does not break parsing.
with open(class_names_file, "r") as f:
    classes = [line.strip() for line in f if line.strip()]

# Build the lookup used to label predictions.
# NOTE: despite its name, `class_to_idx` maps the integer class index to the
# class name (index -> name); the name is kept because later cells use it.
class_to_idx = {}
for entry in classes:
    # Split on the first colon only, so a name that itself contains ':' is
    # kept whole; an entry without any colon raises ValueError here.
    index_text, raw_name = entry.split(":", 1)
    class_to_idx[int(index_text)] = raw_name.strip().strip("'")

NUM_CLASSES = len(classes)

print(f"Number of classes: {len(classes)}")
print(f"Class to index: {class_to_idx}")

### Read train CSV

In [None]:
# Load the training index; the CSV has no header row, so the two columns
# are named explicitly. Show the first ten rows as a sanity check.
csv_columns = ["img", "label"]
df = pd.read_csv(train_csv_file, names=csv_columns, header=None)
print(df.head(10))

## Create dataloader

In [None]:
# Albumentations pipelines.
# The target size comes from the config (IMAGE_HEIGHT / IMAGE_WIDTH) instead of
# a hard-coded 224 so the notebook follows the project settings.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Training: resize, light augmentation, normalise, convert to tensor.
# Boxes are given in YOLO format and transformed together with the image;
# their class ids travel in the `labels` field.
train_transform = A.Compose(
    [
        A.Resize(IMAGE_HEIGHT, IMAGE_WIDTH),
        A.HorizontalFlip(p=0.5),
        # NOTE(review): the crop size equals the resize size, so this crop
        # returns the whole image — confirm whether a smaller crop was intended.
        A.RandomCrop(IMAGE_HEIGHT, IMAGE_WIDTH, p=0.5),
        A.ColorJitter(p=0.3),
        A.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='yolo', label_fields=['labels']),
)

# Evaluation: deterministic resize + normalise only.
test_transform = A.Compose(
    [
        A.Resize(IMAGE_HEIGHT, IMAGE_WIDTH),
        A.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='yolo', label_fields=['labels']),
)

# Build the data loaders.
# With set_type="train" the helper returns a (train, validation) pair;
# with set_type="test" it returns a single loader.
# `pin_memory` follows the config value (PIN_MEMORY) rather than a hard-coded True.
train_dataloader, val_dataloader = dataloader.get_loaders(
    dataset_dir=train_csv_file,
    image_dir=image_path,
    label_dir=label_path,
    batch_size=BATCH_SIZE,
    transform=train_transform,
    set_type="train",
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
)

# The test loader uses a batch size of 1.
test_dataloader = dataloader.get_loaders(
    dataset_dir=test_csv_file,
    image_dir=image_path,
    label_dir=label_path,
    batch_size=1,
    transform=test_transform,
    set_type="test",
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
)

## Visualize samples from the dataloader

In [None]:
# Show the first sample of each of the first 9 validation batches on a 3x3
# grid, with its ground-truth boxes and class names drawn on top.
fig, axs = plt.subplots(3, 3, figsize=(15, 15))
colors = utils.get_color_map()

# Normalisation statistics used to undo A.Normalize for display.
norm_mean = np.array([0.485, 0.456, 0.406])
norm_std = np.array([0.229, 0.224, 0.225])

# One font for both measuring and drawing, so the label background fits the text.
label_font = cv2.FONT_HERSHEY_DUPLEX
label_scale = 0.5
label_thickness = 1

for i, (image, target) in enumerate(itertools.islice(val_dataloader, 9)):
    # CHW tensor -> HWC array, then de-normalise back to the 0-255 range.
    image = image[0].permute(1, 2, 0).detach().cpu().numpy()
    image = (image * norm_std + norm_mean) * 255
    # Clip before the cast: values outside [0, 255] would wrap around in uint8.
    image = np.clip(image, 0, 255).astype(np.uint8)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    # Annotations of the first sample in the batch.
    bboxes = target['boxes'].numpy()[0]
    labels = target['labels'].numpy()[0]

    height, width = image.shape[:2]

    # Draw bboxes: each box is (x_center, y_center, w, h), scaled here by the
    # image size to obtain pixel coordinates.
    for bbox, label in zip(bboxes, labels):
        x, y, w, h = bbox

        x *= width
        y *= height
        w *= width
        h *= height

        left = int(x - w / 2)
        top = int(y - h / 2)
        color = colors[int(label)]

        cv2.rectangle(image, (left, top), (int(left + w), int(top + h)), color, 2)

        # Filled label background sized to the rendered text, then the text itself.
        text = f'{class_to_idx[int(label)]}'
        text_size, _ = cv2.getTextSize(text, label_font, label_scale, label_thickness)
        cv2.rectangle(image, (left, top - text_size[1] - 5), (left + text_size[0] + 5, top), color, -1)
        cv2.putText(image, text, (left + 2, top - 5), label_font, label_scale, (0, 0, 0), label_thickness)

    # Show the image in its grid cell, converted back to RGB for matplotlib.
    axs[i // 3, i % 3].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

# Hide the axes of every cell, including cells left empty when the loader
# yields fewer than 9 batches.
for ax in axs.flat:
    ax.axis('off')

plt.tight_layout()
plt.show()

## DETR Architecture

### Overview
<img src="notebook_images/detr_overall.png" width="900">
<br>

### Output Matching
<img src="notebook_images/detr_output_matching.png" width="900">
<br>


### Architecture
<img src="notebook_images/detr_architecture.png" width="600">

## Model Architecture

In [None]:
# Instantiate the DETR demo network, then move it to the selected device.
# NOTE(review): num_classes is hard-coded to 20 while the criterion is built
# from NUM_CLASSES - 1 — confirm both agree with the class-names file.
net = model.DETRdemo(num_classes=20)
net = net.to(DEVICE)

In [None]:
# Print a layer-by-layer summary of the network when enabled in the config.
# `summary` is already imported at the top of the notebook.
if PRINT_MODEL_SUMMARY:
    summary(
        model=net,
        input_size=(1, 3, 224, 224),  # (batch_size, channels, height, width)
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"],
        # torchinfo defaults to verbose=0 inside notebooks, and a call nested
        # in an `if` is not auto-displayed, so request printing explicitly.
        verbose=1,
    )

## Loss and optimizer

In [None]:
from loss_fn import SetCriterion
from hungarian_matcher import HungarianMatcher

# Matcher handed to the criterion to pair predictions with ground-truth objects.
matcher = HungarianMatcher()

# Per-term loss weights and the loss terms the criterion should compute.
weight_dict = {'loss_ce': 1, 'loss_bbox': 1, 'loss_giou': 1}
losses = ['labels', 'boxes', 'cardinality']

# Passed as `eos_coef`; presumably the relative classification weight of the
# "no object" class, as in the reference DETR implementation — confirm in loss_fn.
NULL_CLASS_COEF = 0.5

criterion = SetCriterion(
    NUM_CLASSES - 1,
    matcher,
    weight_dict,
    eos_coef=NULL_CLASS_COEF,
    losses=losses,
).to(DEVICE)

# NOTE(review): weight_decay=0.1 with plain Adam is a strong L2 penalty; the
# DETR paper trains with AdamW and weight decay 1e-4 — confirm this is intended.
optimizer = torch.optim.Adam(
    params=net.parameters(),
    lr=LEARNING_RATE,
    weight_decay=0.1,
)

## Train model

In [None]:
# Seed the random number generators with the configured SEED before training
# (presumably torch / CUDA seeds — confirm in utils.set_seeds).
# NOTE(review): in file order this runs after `net` and the data loaders were
# created, so model initialisation is not covered by this seed — confirm intent.
utils.set_seeds(SEED)

In [None]:
# Run the training loop for NUM_EPOCHS epochs and keep what it returns.
# NOTE(review): the test loader is passed here while `val_dataloader` is only
# used for visualisation — confirm this is intended.
train_kwargs = dict(
    model=net,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=criterion,
    epochs=NUM_EPOCHS,
    device=DEVICE,
)
results = engine.train(**train_kwargs)

In [None]:
# Plot the curves recorded in `results` (the value returned by engine.train).
# save_fig=True presumably also writes the figure to disk — confirm in utils.
utils.plot_loss_curves(results=results, save_fig=True)