In [5]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection
import torch
from PIL import Image
import requests
from time import time
import onnxruntime as ort
import onnx
import numpy as np
import cv2

# Run the pretrained YOLOS-tiny detector on one local image with PyTorch,
# print every detection above the confidence threshold and report the
# wall-clock time of preprocessing + forward pass + post-processing.

# Alternative sample input (COCO val2017 image fetched over HTTP):
#url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#image = Image.open(requests.get(url, stream=True).raw)
IMAGE_PATH = '../i2l-dataset/ball/ball_02.jpg'
MODEL_NAME = "hustvl/yolos-tiny"

# Force 3-channel RGB so grayscale / CMYK / RGBA files are handed to the
# image processor in the same layout as ordinary colour JPEGs.
image = Image.open(IMAGE_PATH).convert('RGB')

image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForObjectDetection.from_pretrained(MODEL_NAME)

t0 = time()
inputs = image_processor(images=image, return_tensors="pt")
# Inference only: disable autograd so no graph is recorded during the
# forward pass (less memory, and the timing below is not inflated by it).
with torch.no_grad():
    outputs = model(**inputs)

# PIL reports size as (width, height); the post-processor is given
# (height, width), hence the reversal.
target_sizes = torch.tensor([image.size[::-1]])
TH = 0.5  # minimum confidence for a detection to be reported
results = image_processor.post_process_object_detection(
    outputs, threshold=TH, target_sizes=target_sizes
)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

print(f'Time: {time()-t0}')

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Detected kite with confidence 0.927 at location [9.48, 12.52, 217.03, 218.23]
Time: 0.587209939956665


In [None]:

# ====== ONNX ======
# Export the PyTorch detector held in `model` to an ONNX file, using one
# image tensor as the example input for the export.
OUT = 'yolos.onnx'
TEST_IMAGE_PATH = 'test-image.jpg'

image = cv2.imread(TEST_IMAGE_PATH)
# cv2.imread reports a missing or unreadable file by returning None instead
# of raising, so fail here with a clear message rather than on `.copy()`.
if image is None:
    raise FileNotFoundError(f'Could not read image: {TEST_IMAGE_PATH}')
orig = image.copy()

# convert the image from BGR to RGB channel ordering and change the
# image from channels last to channels first ordering
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image.transpose((2, 0, 1))
# add the batch dimension, scale the raw pixel intensities to the
# range [0, 1], and convert the image to a float32 tensor
image = np.expand_dims(image, axis=0)
image = image / 255.0
image = torch.FloatTensor(image)

# The tensor is created on the CPU already, so it is used as-is as the
# example input; `image` is kept as the tensor for the cells that follow.
x = image

t0 = time()
with torch.no_grad():
    # Export on CPU and in eval mode so the exported graph reflects
    # inference behaviour.
    model.cpu()
    model.eval()
    # NOTE(review): `output_names` are assigned positionally to whatever the
    # model returns. For this object-detection model the raw outputs are
    # presumably logits / predicted boxes / hidden state rather than
    # post-processed boxes, labels and scores - confirm before relying on the
    # names. They are left unchanged because the prediction cell requests the
    # outputs by exactly these names.
    torch.onnx.export(model,
                      x,
                      OUT,
                      input_names=['image'],
                      output_names=['boxes', 'labels', 'scores'],
                      do_constant_folding=True,
                      opset_version=12,
                      export_params=True)

print(f'Time of ONNX conversion: {time()-t0}')


In [None]:

# ===== predict with ONNX =====
# Run the exported ONNX model and the original PyTorch model on the same
# input tensor, printing the wall-clock time and the raw outputs of each.

sample = image

# Validate the exported graph before building a runtime session from it.
onnx_model = onnx.load(OUT)
onnx.checker.check_model(onnx_model)

# Name the execution provider explicitly: onnxruntime builds that ship
# additional providers (e.g. GPU ones) require the `providers` argument,
# and the model was exported on the CPU.
ort_session = ort.InferenceSession(OUT, providers=['CPUExecutionProvider'])

input_name = ort_session.get_inputs()[0].name
label_name = ort_session.get_outputs()[1].name
# Ask the session for its output names instead of hard-coding them, so this
# cell keeps working if the names chosen at export time change.
output_names = [out.name for out in ort_session.get_outputs()]

print([inp.name for inp in ort_session.get_inputs()])
print(output_names)

t0 = time()
outputs = ort_session.run(
    output_names,
    {input_name: sample.numpy()},
)


print(f'ONNX: {time()-t0}')
print(outputs)

t0 = time()
# Time the PyTorch baseline without autograd bookkeeping; otherwise the
# comparison against ONNX Runtime is skewed by gradient tracking.
with torch.no_grad():
    y = model(sample)
print(f'\n\n---\nNormally: {time()-t0}')
print(y)