# Object Detection with Transformers

Tutorial from : https://huggingface.co/docs/transformers/tasks/object_detection

In [None]:
# install related libraries
!pip install -q datasets transformers evaluate timm albumentations

# Import libraries 

ที่ใช้ในการทำงาน เราจะใช้ `transformers` และ `datasets` มาช่วยในการทำงาน จาก huggingface hub

In [None]:
from transformers import AutoModelForObjectDetection 
from transformers import AutoImageProcessor
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_dataset
import torch
import torchvision
import evaluate
from tqdm import tqdm
import albumentations
from PIL import Image, ImageDraw
import requests
import numpy as np
import json
import os

# Preparing data

Download `cppe-5` dataset เพื่อใช้ในการ train model และ test model 

In [None]:
# Download the `cppe-5` dataset; the result is kept in `cppe5` for the rest of the notebook.
cppe5 = load_dataset("cppe-5")
# Last expression of the cell: display the loaded dataset object.
cppe5

สร้าง `id2label` และ `label2id` ของ dataset ที่ใช้

In [None]:
# Class names, read from the `category` feature of the training split.
categories = cppe5["train"].features["objects"].feature["category"].names
# Integer class id -> class name (ids start at 0), and the inverse lookup.
id2label = dict(enumerate(categories))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# remove images with no objects
# NOTE(review): the indices refer to the freshly loaded train split. This cell
# overwrites cppe5["train"], so running it a second time drops different rows.
remove_idx = [590, 821, 822, 875, 876, 878, 879]
keep = sorted(set(range(len(cppe5["train"]))).difference(remove_idx))
cppe5["train"] = cppe5["train"].select(keep)

Transform data 

`transform` data ก่อนที่จะเข้าไปใช้ใน model โดยใช้ `albumentations` ในการทำ data augmentation

In [None]:
# Augmentation pipeline applied to every training image together with its boxes.
# NOTE(review): both the flip and the brightness/contrast step use p=1.0, so every
# image is always mirrored — confirm this is intended (p=0.5 is the usual choice).
transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480), # resize to 480x480
        albumentations.HorizontalFlip(p=1.0), # horizontal flip
        albumentations.RandomBrightnessContrast(p=1.0), # random brightness and contrast
    ],
    # Boxes are declared as "coco" format; class labels travel in the `category` field.
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

define function สำหรับการ train model

In [None]:
# formatted annotations
def formatted_anns(image_id, category, area, bbox):
    """Build the list of COCO-style annotation dicts for one image.

    Args:
        image_id: Identifier of the image the annotations belong to.
        category: Sequence of class ids, one per object.
        area: Sequence of object areas, indexed in step with ``category``.
        bbox: Sequence of bounding boxes, indexed in step with ``category``.

    Returns:
        A list with one dict per entry of ``category`` holding the keys
        ``image_id``, ``category_id``, ``iscrowd``, ``area`` and ``bbox``
        (the box is copied into a plain ``list``).

    Raises:
        IndexError: If ``area`` or ``bbox`` is shorter than ``category``.
    """
    return [
        {
            "image_id": image_id,
            "category_id": category_id,
            # COCO spells this key in lowercase; "isCrowd" is not a COCO key.
            "iscrowd": 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        for i, category_id in enumerate(category)
    ]

# transforming a batch
def transform_aug_ann(examples):
    """Augment a batch of examples and run it through the image processor.

    Reads the notebook-level names ``transform``, ``formatted_anns`` and
    ``image_processor``; all three must exist when this function is called.

    Args:
        examples: Batch mapping with the columns ``image_id``, ``image`` and
            ``objects``; each ``objects`` entry provides ``bbox``, ``category``
            and ``area``.

    Returns:
        Whatever ``image_processor`` returns for the augmented images and their
        annotations (called with ``return_tensors="pt"``).
    """
    image_ids = examples["image_id"]
    images, bboxes, area, categories = [], [], [], []
    for image, objects in zip(examples["image"], examples["objects"]):
        # Keep the RGB channel order: evaluation and inference hand RGB images to
        # the same processor, so training must not see channel-reversed input.
        image = np.array(image.convert("RGB"))
        out = transform(image=image, bboxes=objects["bbox"], category=objects["category"])

        # NOTE(review): `area` is taken from the original annotation and is not
        # rescaled to the augmented image size — confirm this is acceptable.
        area.append(objects["area"])
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])

    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
    ]

    return image_processor(images=images, annotations=targets, return_tensors="pt")

def collate_fn(batch):
    """Collate processed examples into one padded model batch.

    Reads the notebook-level ``image_processor`` to pad the images.

    Args:
        batch: Sequence of dicts, each with ``pixel_values`` and ``labels``.

    Returns:
        A dict with the padded ``pixel_values``, the matching ``pixel_mask`` and
        ``labels`` (a list with one entry per example, left unpadded).
    """
    pixel_values = [item["pixel_values"] for item in batch]
    # `pad` supersedes the deprecated `pad_and_create_pixel_mask`; fall back to the
    # old name so processors that only provide that method keep working.
    pad = getattr(image_processor, "pad", None)
    if pad is None:
        pad = image_processor.pad_and_create_pixel_mask
    encoding = pad(pixel_values, return_tensors="pt")
    return {
        "pixel_values": encoding["pixel_values"],
        "pixel_mask": encoding["pixel_mask"],
        "labels": [item["labels"] for item in batch],
    }

In [None]:
# Attach the augmentation + preprocessing function to the training split.
# NOTE(review): transform_aug_ann reads the global `image_processor`, which is only
# created in a later cell; it must exist before any training example is accessed.
cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann)

# Train model

In [None]:
# Choose the pretrained checkpoint and the image processor to use
checkpoint = "facebook/detr-resnet-50" 
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [None]:
# Build the model from the chosen pretrained checkpoint, passing the dataset's label maps
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    # presumably needed because the checkpoint was trained with a different label set — verify
    ignore_mismatched_sizes=True,
)

กำหนด `training_args` และ `trainer` สำหรับการเทรนโมเดล

In [None]:
# [optional] disable wandb, which is used for logging
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Training hyperparameters handed to the Trainer.
training_args = TrainingArguments(
    output_dir="{directory_name}",  # name of the folder where the trained model is stored
    per_device_train_batch_size=2,  # batch size per device
    num_train_epochs=10,  # number of epochs to train for
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    save_steps=200,  # save a checkpoint every 200 steps
    logging_steps=50,  # log every 50 steps
    learning_rate=1e-5,  # learning rate
    weight_decay=1e-4,  # weight decay
    save_total_limit=2,  # maximum number of checkpoints kept on disk
    remove_unused_columns=False,  # keep the `image` column that transform_aug_ann reads
    push_to_hub=False,
)

In [None]:
# Create the trainer from the model, arguments, collate function and training split built earlier
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=cppe5["train"],
    # the image processor is passed through the `tokenizer` argument
    tokenizer=image_processor,
)

In [None]:
# Fine-tune the model with the settings in `training_args`.
trainer.train() # train model

# Evaluation

ทดสอบ model หรือ `evaluation` เพื่อเช็คผลลัพธ์ของโมเดล 

In [None]:
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format dataset whose samples are passed through an image processor.

    Each item is a dict with ``pixel_values`` (the processed image without its
    batch dimension) and ``labels`` (the processed target for that image).
    """

    def __init__(self, img_folder, feature_extractor, ann_file):
        """Store the processor and initialise the underlying COCO dataset.

        Args:
            img_folder: Directory containing the image files.
            feature_extractor: Callable taking ``images``, ``annotations`` and
                ``return_tensors`` and returning ``pixel_values`` and ``labels``.
            ann_file: Path to the COCO annotation JSON file.
        """
        super().__init__(img_folder, ann_file)
        self.feature_extractor = feature_extractor

    def __getitem__(self, idx):
        """Return the processed image and target at position ``idx``."""
        # read in PIL image and target in COCO format
        img, target = super().__getitem__(idx)

        # preprocess image and target: converting target to DETR format,
        # resizing + normalization of both image and target
        image_id = self.ids[idx]
        target = {"image_id": image_id, "annotations": target}
        encoding = self.feature_extractor(images=img, annotations=target, return_tensors="pt")
        # Drop only the leading batch dimension; a bare squeeze() would also
        # remove any other axis that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]  # remove batch dimension

        return {"pixel_values": pixel_values, "labels": target}


# Build COCO-style annotations for evaluation; no data augmentation is involved here
def val_formatted_anns(image_id, objects):
    """Turn one example's ``objects`` record into a list of annotation dicts.

    Args:
        image_id: Identifier of the image the objects belong to.
        objects: Mapping with the parallel sequences ``id``, ``category``,
            ``area`` and ``bbox``.

    Returns:
        One dict per entry of ``objects["id"]`` with the keys ``id``,
        ``category_id``, ``iscrowd`` (always 0), ``image_id``, ``area`` and ``bbox``.
    """
    return [
        {
            "id": ann_id,
            "category_id": objects["category"][pos],
            "iscrowd": 0,
            "image_id": image_id,
            "area": objects["area"][pos],
            "bbox": objects["bbox"][pos],
        }
        for pos, ann_id in enumerate(objects["id"])
    ]


# Save images and annotations into the files torchvision.datasets.CocoDetection expects
def save_cppe5_annotation_file_images(cppe5):
    """Write a dataset split to disk as PNG images plus one COCO annotation file.

    Everything is written to ``<current working directory>/cppe5/``. Reads the
    notebook-level names ``id2label`` and ``val_formatted_anns``.

    Args:
        cppe5: Iterable of examples, each a mapping with ``image_id``, ``image``
            (an object exposing ``width``, ``height`` and ``save(path)``) and
            ``objects``.

    Returns:
        Tuple ``(path_output_cppe5, path_anno)``: the output directory and the
        path of the annotation JSON file inside it.
    """
    path_output_cppe5 = f"{os.getcwd()}/cppe5/"
    # exist_ok avoids the separate existence check and tolerates re-runs.
    os.makedirs(path_output_cppe5, exist_ok=True)
    path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json")

    images_json, annotations_json = [], []
    # Single pass: record the metadata and save the image while it is at hand,
    # instead of loading the whole image column a second time afterwards.
    for example in cppe5:
        image, image_id = example["image"], example["image_id"]
        file_name = f"{image_id}.png"
        images_json.append(
            {
                "id": image_id,
                "width": image.width,
                "height": image.height,
                "file_name": file_name,
            }
        )
        annotations_json.extend(val_formatted_anns(image_id, example["objects"]))
        image.save(os.path.join(path_output_cppe5, file_name))

    output_json = {
        "images": images_json,
        "annotations": annotations_json,
        "categories": [
            {"supercategory": "none", "id": label_id, "name": label_name}
            for label_id, label_name in id2label.items()
        ],
    }

    with open(path_anno, "w") as file:
        json.dump(output_json, file, ensure_ascii=False, indent=4)

    return path_output_cppe5, path_anno

เลือก `model` ที่เราใช้ในการ evaluate

In [None]:
model_path = "{directory_name}/{model}" # set this to the directory of the model we trained

In [None]:
# Load the image processor from the model we trained
im_processor = AutoImageProcessor.from_pretrained(f"{model_path}/preprocessor_config.json", local_files_only=True)
# Export the test split to disk and wrap it as a COCO-format dataset
path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"])
test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno)

# Load the model we trained
model = AutoModelForObjectDetection.from_pretrained(f"{model_path}", local_files_only=True)
module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco)
# NOTE(review): collate_fn pads with the global `image_processor`, not `im_processor`,
# so the training-section processor must still be defined when this cell runs.
val_dataloader = torch.utils.data.DataLoader(
    test_ds_coco_format, batch_size=2, shuffle=False, num_workers=0, collate_fn=collate_fn
)

# prediction and evaluation
with torch.no_grad():
    for idx, batch in enumerate(tqdm(val_dataloader)):
        pixel_values = batch["pixel_values"]
        pixel_mask = batch["pixel_mask"]

        labels = [
            {k: v for k, v in t.items()} for t in batch["labels"]
        ]  # these are in DETR format, resized + normalized

        # forward pass
        outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

        orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
        results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to COCO api

        # accumulate this batch's predictions and references in the metric
        module.add(prediction=results, reference=labels)
        del batch

# `results` is reused here: it now holds the final metric values
results = module.compute()

# show the evaluation results
print(results)

# Inference

Prediction `inference` โดยใช้ `model` ที่เรา train ไว้

In [None]:
# Choose the image to run prediction on, fetched from a URL
url = "https://i.imgur.com/2lnWoly.jpg"
# The timeout keeps a stalled download from hanging the cell.
response = requests.get(url, stream=True, timeout=30)
# Fail with a clear HTTP error instead of an obscure image-decoding error.
response.raise_for_status()
image = Image.open(response.raw)

In [None]:
model_path = "{directory_name}/{model}" # set this to the directory of the model we trained

In [None]:
# Load the image processor and the model from the trained model directory
image_processor = AutoImageProcessor.from_pretrained(f"{model_path}/preprocessor_config.json", local_files_only=True)  # processor 
model = AutoModelForObjectDetection.from_pretrained(f"{model_path}", local_files_only=True) # model 

# predict image
with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt")
    outputs = model(**inputs)
    # PIL's image.size is (width, height); reversing it gives (height, width)
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] # set a suitable threshold

`threshold` เราสามารถเปลี่ยนค่าได้ตามความเหมาะสม เนื่องจากในบาง model ที่เราเทรนไว้ จะมีค่า confidence ของการ `prediction` ที่ต่างกัน

In [None]:
# Print one line per detection: class name, confidence and box coordinates
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(coord, 2) for coord in box.tolist()]
    print(f"Detected {model.config.id2label[label.item()]} with confidence {round(score.item(), 3)} at location {box}")

Visualize prediction โดยใช้ `ImageDraw` 

In [None]:
# Draw every detection (red box plus class name) directly onto `image`
draw = ImageDraw.Draw(image)

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(coord, 2) for coord in box.tolist()]
    x, y, x2, y2 = box
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    draw.text((x, y), text=model.config.id2label[label.item()], fill="white")

# Last expression of the cell: display the annotated image
image