In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import supervision as sv
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor, AutoModelForObjectDetection, AutoImageProcessor
import torchvision.transforms as T
import albumentations as A
from dataclasses import replace



In [2]:
# Identifier handed to `from_pretrained` when the model and processor are loaded.
CHECKPOINT = "PekingU/rtdetr_r50vd_coco_o365"

# Prefer the GPU whenever CUDA reports itself as available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [3]:
# Image processor matching CHECKPOINT.
processor = AutoImageProcessor.from_pretrained(CHECKPOINT)

# Detection model loaded from the same checkpoint, then placed on DEVICE.
model = AutoModelForObjectDetection.from_pretrained(CHECKPOINT)
model = model.to(DEVICE)

In [4]:
# Root folder of the dataset; every split ("train", "test", "valid") is a
# sub-folder holding its images and a `_annotations.coco.json` file.
# The default is machine-specific, so it can be overridden without editing the
# notebook by setting the CV_GARBAGE_DATA_DIR environment variable.
ds_path = os.environ.get(
    "CV_GARBAGE_DATA_DIR",
    r"C:\Users\isaac\dev\CV_Garbage_Detection\Data",
)


def load_coco_split(split, root=ds_path):
    """Load one dataset split stored in COCO format.

    Args:
        split: Name of the sub-folder under ``root`` (e.g. ``"train"``).
        root: Dataset root folder; defaults to ``ds_path``.

    Returns:
        The dataset object produced by ``sv.DetectionDataset.from_coco`` for
        ``<root>/<split>`` and ``<root>/<split>/_annotations.coco.json``.
    """
    split_dir = os.path.join(root, split)
    return sv.DetectionDataset.from_coco(
        images_directory_path=split_dir,
        annotations_path=os.path.join(split_dir, "_annotations.coco.json"),
    )


ds_train = load_coco_split("train")
ds_test = load_coco_split("test")
ds_valid = load_coco_split("valid")

In [5]:
# Training-time augmentation: each transform is applied independently with the
# probability `p` given below.
augmentation_train = A.Compose(
    transforms=[
        A.Perspective(p=0.1),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.5),
        A.HueSaturationValue(p=0.1),
    ],
    # Boxes are passed as pascal_voc corners with their labels in the
    # "category" field; clipping is enabled and the minimum box area is 25.
    bbox_params=A.BboxParams(
        format="pascal_voc",
        label_fields=["category"],
        min_area=25,
        clip=True,
    ),
)

# Evaluation-time pipeline: the only transform is a no-op, so the image itself
# is unchanged, but the boxes still go through the same bbox handling
# (pascal_voc corners, "category" labels, clipping, minimum area of 1).
augmentation_valid = A.Compose(
    transforms=[A.NoOp()],
    bbox_params=A.BboxParams(
        format="pascal_voc",
        label_fields=["category"],
        min_area=1,
        clip=True,
    ),
)

In [6]:
# Number of training samples to push through the augmentation pipeline.
IMAGE_COUNT = 5

# Cap the range so a dataset holding fewer than IMAGE_COUNT images does not
# raise an IndexError.
for i in range(min(IMAGE_COUNT, len(ds_train))):
    _, image, annotations = ds_train[i]

    output = augmentation_train(
        image=image,
        bboxes=annotations.xyxy,
        category=annotations.class_id,
    )

    augmented_image = output["image"]
    augmented_annotations = replace(
        annotations,
        # reshape(-1, 4) keeps the array two-dimensional even when the
        # pipeline removed every box (an empty list would otherwise become
        # a 1-D array of shape (0,)).
        xyxy=np.array(output["bboxes"]).reshape(-1, 4),
        # Explicit integer dtype so an empty label list does not default to
        # a float array.
        class_id=np.array(output["category"], dtype=int),
    )

# NOTE: nothing in this cell consumes the results; after the loop only the
# last iteration's `augmented_image` / `augmented_annotations` remain bound.
    

In [7]:
class AugmentedDetectionDataset(Dataset):
    """Torch dataset that augments detection samples and encodes them.

    Each item is read from the wrapped dataset, run through the augmentation
    pipeline, converted to a COCO-style annotation record and finally passed
    to the image processor.
    """

    def __init__(self, dataset, processor, transform):
        """Store the collaborators used by ``__getitem__``.

        Args:
            dataset: Sized, indexable source whose items unpack as
                ``(_, image, annotations)``; ``annotations`` must expose
                ``xyxy`` and ``class_id``.
            processor: Callable accepting ``images``, ``annotations`` and
                ``return_tensors`` keyword arguments.
            transform: Callable accepting ``image``, ``bboxes`` and
                ``category`` keyword arguments and returning a mapping with
                those same keys.
        """
        self.dataset = dataset
        self.processor = processor
        self.transform = transform

    @staticmethod
    def annotations_as_coco(image_id, categories, boxes):
        """Convert corner-format boxes into a COCO-style annotation record.

        Args:
            image_id: Identifier stored on the record and on each annotation.
            categories: Iterable of class ids, one per box.
            boxes: Iterable of ``(x_min, y_min, x_max, y_max)`` boxes.

        Returns:
            ``{"image_id": image_id, "annotations": [...]}`` where every
            annotation holds ``image_id``, ``category_id``, ``bbox`` as
            ``[x, y, width, height]``, ``iscrowd`` (always 0) and ``area``.
            Empty inputs yield an empty ``annotations`` list.
        """
        annotations = []
        for category, box in zip(categories, boxes):
            x_min, y_min, x_max, y_max = map(float, box)
            width = x_max - x_min
            height = y_max - y_min
            annotations.append(
                {
                    "image_id": image_id,
                    # Plain int so numpy scalar labels serialise cleanly.
                    "category_id": int(category),
                    "bbox": [x_min, y_min, width, height],
                    "iscrowd": 0,
                    "area": width * height,
                }
            )

        return {"image_id": image_id, "annotations": annotations}

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processor output for the augmented sample at ``idx``.

        Returns:
            A dict with the same keys as the processor result, each value
            reduced to its first element (``v[0]``).
        """
        _, image, annotations = self.dataset[idx]

        # Reverse the channel axis (presumably BGR -> RGB; confirm against
        # how the wrapped dataset loads images).
        image = image[:, :, ::-1]
        transformed = self.transform(
            image=image,
            bboxes=annotations.xyxy,
            category=annotations.class_id
        )
        image = transformed["image"]
        boxes = transformed["bboxes"]
        categories = transformed["category"]

        formatted_annotations = self.annotations_as_coco(
            image_id=idx, 
            categories=categories, 
            boxes=boxes
        )
        result = self.processor(
            images=image, 
            annotations=formatted_annotations, 
            return_tensors="pt"
        )

        # The processor is called with a single image, so take element 0 of
        # every returned value.
        return {k: v[0] for k, v in result.items()}

In [8]:
# Wrap each split; only the training split uses the random augmentations,
# validation and test share the no-op pipeline.
augmented_dataset_train = AugmentedDetectionDataset(
    dataset=ds_train,
    processor=processor,
    transform=augmentation_train,
)
augmented_dataset_valid = AugmentedDetectionDataset(
    dataset=ds_valid,
    processor=processor,
    transform=augmentation_valid,
)
augmented_dataset_test = AugmentedDetectionDataset(
    dataset=ds_test,
    processor=processor,
    transform=augmentation_valid,
)

In [9]:
def collate_fn(batch):
    """Merge per-sample dicts into one batch dict.

    Args:
        batch: Sequence of dicts, each with ``"pixel_values"`` (a tensor) and
            ``"labels"`` entries.

    Returns:
        A dict whose ``"pixel_values"`` is the samples' tensors stacked along
        a new leading dimension and whose ``"labels"`` is the list of the
        samples' label objects, left unstacked.
    """
    stacked_pixels = torch.stack([sample["pixel_values"] for sample in batch])
    per_sample_labels = [sample["labels"] for sample in batch]
    return {"pixel_values": stacked_pixels, "labels": per_sample_labels}

In [10]:
# Lookup tables between class index and class name, using each name's
# position in `ds_train.classes` as its index.
id2label = dict(enumerate(ds_train.classes))
label2id = {name: index for index, name in id2label.items()}

# Load the checkpoint again with the dataset's label maps.
# `ignore_mismatched_sizes=True` allows weights whose shapes differ from the
# checkpoint to be newly initialised instead of failing the load.
# NOTE: this rebinds `model` and, unlike the earlier load, does not call
# `.to(DEVICE)` in this cell.
model = AutoModelForObjectDetection.from_pretrained(
    CHECKPOINT,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
    anchor_image_size=None,
)

Some weights of RTDetrForObjectDetection were not initialized from the model checkpoint at PekingU/rtdetr_r50vd_coco_o365 and are newly initialized because the shapes did not match:
- model.decoder.class_embed.0.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([19]) in the model instantiated
- model.decoder.class_embed.0.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([19, 256]) in the model instantiated
- model.decoder.class_embed.1.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([19]) in the model instantiated
- model.decoder.class_embed.1.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([19, 256]) in the model instantiated
- model.decoder.class_embed.2.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([19]) in the model instantiated
- model.decoder.class_embed.2.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([19, 256]) in the model instantiated
- 