In [1]:
# https://huggingface.co/docs/transformers/tasks/object_detection

In [2]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "5, 6, 7"
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [3]:
from datasets import load_dataset
import os

base_dir = '/usr/users/henrich1/exercises_summer_school/data/object_detection'
train_path = os.path.join(base_dir, 'train.csv')
val_path = os.path.join(base_dir, 'val.csv')
data_files = {
    'train': train_path,
    'val': val_path
}

dataset = load_dataset("csv", data_files=data_files)
print(dataset['train'])
print(dataset['val'])

id2label = {0: 'pig'}
label2id = {'pig': 0}

Dataset({
    features: ['image_id', 'image_name'],
    num_rows: 873
})
Dataset({
    features: ['image_id', 'image_name'],
    num_rows: 98
})


In [4]:
from transformers import AutoImageProcessor, YolosImageProcessor

checkpoint = "facebook/detr-resnet-50"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [5]:
import albumentations
import numpy as np

transform = albumentations.Compose(
    [
        # albumentations.Resize(400, 640),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.RandomBrightnessContrast(p=0.5),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

In [6]:
def formatted_anns(image_id, category, area, bbox):
    annotations = []
    for i in range(len(category)):

        new_ann = {
            "image_id": image_id,
            "category_id": category[i],
            "isCrowd": 0, # no background class
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        annotations.append(new_ann)

    return annotations

In [7]:
from PIL import Image

# transforming a batch # HIER WEITER
def transform_aug_ann(examples, base_dir):
    images_dir = os.path.join(base_dir, 'images')
    labels_dir = os.path.join(base_dir, 'labels')
    image_ids = examples["image_id"]
    images, bboxes, areas, categories = [], [], [], []
    
    for image_name in examples["image_name"]:
        image = Image.open(os.path.join(images_dir, image_name + '.jpg'))
        image = np.array(image.convert("RGB"))[:, :, ::-1]
        
        bbox = np.loadtxt(os.path.join(labels_dir, image_name + '.txt'))
        if bbox.ndim == 1:
            bbox = bbox[None, :]
        category = [0 for _ in range(len(bbox))] # only one class
        out = transform(image=image, bboxes=bbox, category=category)
        area = np.array(out["bboxes"])[:, 2] * np.array(out["bboxes"])[:, 3]

        areas.append(area)
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])

    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, areas, bboxes)
    ]

    return image_processor(images=images, annotations=targets, return_tensors="pt")

In [8]:
dataset['train'] = dataset['train'].with_transform(lambda examples: transform_aug_ann(examples, base_dir))


In [9]:
dataset['train'][0]
# dataset['train'][0]['labels']['orig_size']

{'pixel_values': tensor([[[ 0.1597,  0.0227, -0.0458,  ...,  0.5364,  0.5707,  0.5536],
          [-0.3027, -0.1657,  0.0569,  ...,  0.3994,  0.6563,  0.8447],
          [ 0.1597,  0.2111,  0.2796,  ...,  0.4508,  0.6049,  0.6734],
          ...,
          [-1.1932, -1.1932, -1.1932,  ..., -0.7308, -1.0390, -0.9020],
          [-1.2103, -1.2103, -1.2103,  ..., -0.7650, -0.9020, -0.9363],
          [-1.2103, -1.2103, -1.2103,  ..., -0.8507, -0.7993, -1.0048]],
 
         [[ 0.2752,  0.1352,  0.0651,  ...,  0.1001,  0.0301, -0.0574],
          [-0.1975, -0.0574,  0.1527,  ..., -0.0224,  0.1001,  0.2227],
          [ 0.2752,  0.3102,  0.3978,  ...,  0.0476,  0.0651,  0.1001],
          ...,
          [-0.7402, -0.7402, -0.7402,  ..., -0.5301, -0.8803, -0.7927],
          [-0.7752, -0.7752, -0.7752,  ..., -0.5651, -0.7402, -0.8277],
          [-0.7752, -0.7752, -0.7752,  ..., -0.6702, -0.6527, -0.8978]],
 
         [[-0.1661, -0.3055, -0.3927,  ...,  0.0431, -0.0092, -0.0615],
          [-

In [10]:
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = image_processor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    batch["labels"] = labels
    return batch

In [11]:
from transformers import AutoModelForObjectDetection

model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model checkpoin

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="detr-resnet-50_finetuned_pigs",
    per_device_train_batch_size=6,
    num_train_epochs=50,
    fp16=True,
    save_steps=200,
    logging_steps=200,
    learning_rate=1e-4,
    weight_decay=1e-4,
    save_total_limit=20,
    remove_unused_columns=False,
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset["train"],
    tokenizer=image_processor,
)

trainer.train()

In [None]:
from transformers import pipeline
from PIL import Image

path = '/usr/users/henrich1/exercises_summer_school/data/object_detection/images/cam1_120180328-194558-1522259158_frame_27590.jpg'
image = Image.open(path)

obj_detector = pipeline("object-detection", model="/usr/users/henrich1/exercises_summer_school/exercises/object_detection/yolo_ft/checkpoint-800")
obj_detector(image)

In [None]:
inputs = image_processor(images=image, return_tensors="pt").to('cuda')

In [None]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection, YolosImageProcessor
import torch

image_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
model = AutoModelForObjectDetection.from_pretrained("/usr/users/henrich1/exercises_summer_school/exercises/object_detection/detr-resnet-50_finetuned_pigs/checkpoint-2600").to('cuda')

# image_path = '/usr/users/henrich1/exercises_summer_school/data/object_detection/images/Kamera320170324-103300-1490347980_frame_15038.jpg'
# image_path = '/usr/users/henrich1/exercises_summer_school/data/object_detection/images/Kamera120171025-115501-1508925301_frame_16598.jpg'
# image_path = '/usr/users/henrich1/exercises_summer_school/data/object_detection/images/Kamera320180307-184059-1520444459_cropped_00-08-43_00-10-35_frame68_missing.jpg'
image_path = '/usr/users/henrich1/exercises_summer_school/data/object_detection/images/Kamera420170722-140519-1500725119_cropped_00-26-34_00-26-47_frame11_lowconf.jpg'
image = Image.open(image_path)

with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt").to('cuda')
    outputs = model(**inputs)
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

In [None]:
from PIL import ImageDraw

In [None]:
draw = ImageDraw.Draw(image)

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    x, y, x2, y2 = tuple(box)
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    draw.text((x, y), model.config.id2label[label.item()], fill="white")

image

In [None]:
# from safetensors import safe_open


# test = torch.load('/usr/users/henrich1/exercises_summer_school/exercises/object_detection/detr-resnet-50_finetuned_pigs/checkpoint-2750/rng_state.pth')

# tensors = {}
# with safe_open("/usr/users/henrich1/exercises_summer_school/exercises/object_detection/detr-resnet-50_finetuned_pigs/checkpoint-2750/model.safetensors", framework="pt", device=0) as f:
#     for k in f.keys():
#         tensors[k] = f.get_tensor(k)

# tensors.keys()