In [1]:
from transformers import DetrForObjectDetection, DetrImageProcessor
from torch.utils.data import DataLoader
import torch
import supervision as sv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_loc = r"C:\Users\isaac\dev\CV_Garbage_Detection\Data"

ds_train = sv.DetectionDataset.from_yolo(
    images_directory_path=f"{dataset_loc}/train/images",
    annotations_directory_path=f"{dataset_loc}/train/labels",
    data_yaml_path=f"{dataset_loc}/data.yaml",
)

ds_valid = sv.DetectionDataset.from_yolo(
    images_directory_path=f"{dataset_loc}/test/images",
    annotations_directory_path=f"{dataset_loc}/test/labels",
    data_yaml_path=f"{dataset_loc}/data.yaml",
)

ds_test = sv.DetectionDataset.from_yolo(
    images_directory_path=f"{dataset_loc}/valid/images",
    annotations_directory_path=f"{dataset_loc}/valid/labels",
    data_yaml_path=f"{dataset_loc}/data.yaml",
)

print(f"Number of training images: {len(ds_train)}")
print(f"Number of testing images: {len(ds_test)}")
print(f"Number of validation images: {len(ds_valid)}")

Number of training images: 4200
Number of testing images: 1704
Number of validation images: 100


In [17]:
class SVDetectionDataset(torch.utils.data.Dataset):
    def __init__(self, detection_dataset, processor):
        self.detection_dataset = detection_dataset
        self.processor = processor

    def __len__(self):
        return len(self.detection_dataset)

    def __getitem__(self, idx):
        # sv.DetectionDataset returns a tuple: (image, bboxes, category_ids, metadata)
        image, bboxes, category_ids, = self.detection_dataset[idx]

        # Prepare target
        target = {"image_id": torch.tensor([idx])}
        annotations = [
            {
                "bbox": bbox,
                "category_id": cat_id,
                "area": (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]),
                "iscrowd": 0,
            }
            for bbox, cat_id in zip(bboxes, category_ids)
        ]
        target["annotations"] = annotations

        # Preprocess image and target
        encoding = self.processor(images=image, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()  # Remove batch dimension
        labels = encoding["labels"][0]  # Extract target labels

        return pixel_values, labels

In [10]:
# Load pre-trained DETR model
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Wrap in SVDetectionDataset for use with DETR
train_dataset = SVDetectionDataset(ds_train, processor)
valid_dataset = SVDetectionDataset(ds_valid, processor)

# DataLoader setup
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
valid_dataloader = DataLoader(valid_dataset, batch_size=2, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

In [19]:
num_classes = len(ds_train.classes)  # Get the number of classes in your dataset

# Update the classification head for your dataset
model.class_labels_classifier = torch.nn.Linear(model.config.hidden_size, num_classes + 1)  # +1 for background class

In [13]:
from torch.optim import AdamW

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 10
for epoch in range(epochs):
    model.train()
    for i, (pixel_values, labels) in enumerate(train_dataloader):
        pixel_values = torch.stack(pixel_values).to(device)
        labels = [{k: v.to(device) for k, v in t.items()} for t in labels]

        # Forward pass
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f"Epoch {epoch}, Step {i}, Loss: {loss.item()}")

ValueError: not enough values to unpack (expected 4, got 3)