In [7]:
from typing import List
import io

import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection
import numpy as np

class VLMManager:
    """Caption-driven object locator built on the OWLv2 zero-shot detector.

    ``identify`` takes raw image bytes plus a text caption and returns a single
    bounding box in ``[left, top, width, height]`` form, expressed in pixels of
    the original (un-resized) image.
    """

    # Checkpoint identifier shared by the processor and the detection model.
    MODEL_NAME = "google/owlv2-base-patch16-ensemble"
    # Minimum detection score kept by the post-processing step.
    SCORE_THRESHOLD = 0.2

    def __init__(self):
        """Load the processor and the model, moving the model to CUDA when available."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = AutoProcessor.from_pretrained(self.MODEL_NAME)
        self.model = Owlv2ForObjectDetection.from_pretrained(self.MODEL_NAME)
        self.model.to(self.device)

    def resize(self, img, base_width=380):
        """Return ``img`` scaled to ``base_width`` pixels wide, keeping its aspect ratio.

        The height is clamped to at least one pixel so that extremely wide
        images do not produce an invalid zero-height target size.
        """
        wpercent = base_width / float(img.size[0])
        hsize = max(1, int(float(img.size[1]) * wpercent))
        return img.resize((base_width, hsize), Image.Resampling.LANCZOS)

    def resize_box(self, box, original_size, resized_size):
        """Map a corner-format box from resized-image space to original-image space.

        Parameters
        ----------
        box : sequence of 4 numbers
            ``[x_min, y_min, x_max, y_max]`` in resized-image pixels.
        original_size : (width, height)
            Size of the original image.
        resized_size : (width, height)
            Size of the image the box was predicted on.

        Returns
        -------
        list[int]
            ``[left, top, width, height]`` in original-image pixels, each value
            rounded to the nearest integer.
        """
        x_scale = original_size[0] / resized_size[0]
        y_scale = original_size[1] / resized_size[1]
        left = box[0] * x_scale
        top = box[1] * y_scale
        width = box[2] * x_scale - left
        height = box[3] * y_scale - top
        return [int(round(value)) for value in (left, top, width, height)]

    @staticmethod
    def _fallback_box(image_size):
        """Return the centred default box used when nothing is detected.

        The box starts at a quarter of the image width/height and spans half of
        each dimension. It is expressed as ``[left, top, width, height]`` so it
        matches the format of every detected box returned by ``identify``.
        """
        width, height = image_size
        return [int(0.25 * width), int(0.25 * height), int(0.5 * width), int(0.5 * height)]

    def identify(self, image: bytes, caption: str) -> List[int]:
        """Locate the object described by ``caption`` inside ``image``.

        Parameters
        ----------
        image : bytes
            Encoded image file contents (anything PIL can open).
        caption : str
            Free-text description of the target; it is lower-cased and prefixed
            with ``"a photo of "`` before being passed to the model.

        Returns
        -------
        list[int]
            ``[left, top, width, height]`` of the highest-scoring detection in
            original-image pixels, or the centred fallback box when no
            detection passes ``SCORE_THRESHOLD``.
        """
        pil_image = Image.open(io.BytesIO(image)).convert('RGB')
        ori_size = pil_image.size
        resized_image = self.resize(pil_image)

        text = [["a photo of " + caption.lower()]]

        inputs = self.processor(images=resized_image, text=text, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)

        # PIL reports (width, height); post-processing expects (height, width).
        # NOTE(review): the logged runs kept in this notebook show the same
        # detection with two different y-coordinates, which suggests the result
        # of this step depends on the installed transformers version (OWLv2
        # square padding) - re-validate against ground truth after upgrades.
        target_sizes = torch.Tensor([resized_image.size[::-1]])
        results = self.processor.post_process_object_detection(
            outputs=outputs, threshold=self.SCORE_THRESHOLD, target_sizes=target_sizes
        )
        boxes, scores = results[0]["boxes"], results[0]["scores"]

        # Keep the first box with the strictly highest score; convert it to
        # original-image coordinates only once, after the winner is known.
        best_box, best_score = None, 0
        for box, score in zip(boxes, scores):
            score_value = score.item()
            if score_value > best_score:
                best_box, best_score = box, score_value

        if best_box is None:
            return self._fallback_box(ori_size)
        return self.resize_box(best_box.tolist(), ori_size, resized_image.size)

In [8]:
# Build the detector once; the constructor loads the OWLv2 processor and model.
vlm_manager = VLMManager()



# Read the sample image as raw bytes, which is the input type identify() takes.
with open('image_0.jpg', 'rb') as img_file:
    image_bytes = img_file.read()
# Free-text description of the target; identify() lower-cases it and prefixes "a photo of ".
caption = "blue and white commercial aircraft"
# For a detection, the returned box is [left, top, width, height] in original-image pixels.
bounding_box = vlm_manager.identify(image_bytes, caption)
print("Bounding Box:", bounding_box)

Bounding Box: [803, 320, 123, 36]


In [15]:
# from https://gist.github.com/meyerjo/dd3533edc97c81258898f60d8978eddc


from statistics import mean
from typing import List


def bb_iou(bb1: List[int], bb2: List[int]) -> int:
    """
    Score two bounding boxes in ltwh format as a match (1) or a miss (0) at IoU 0.5.

    Parameters
    ----------
    bb1 : list[int, int, int, int]
        left, top, width, height
    bb2 : list[int, int, int, int]
        left, top, width, height

    Returns
    -------
    int
        1 when the Intersection over Union is strictly greater than 0.5,
        otherwise 0. An IoU of exactly 0.5 yields 0 because ``round`` rounds
        halves to the nearest even integer. Non-overlapping (or merely
        touching) boxes also yield the integer 0.
    """
    # convert ltwh to corner coordinates (left, top, right, bottom)
    boxA = [bb1[0], bb1[1], bb1[0] + bb1[2], bb1[1] + bb1[3]]
    boxB = [bb2[0], bb2[1], bb2[0] + bb2[2], bb2[1] + bb2[3]]

    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # compute the area of the intersection rectangle; each side is clamped at
    # zero so disjoint boxes produce an empty intersection
    interArea = max(xB - xA, 0) * max(yB - yA, 0)
    if interArea == 0:
        # return an int (not 0.0) so every score has the same type
        return 0

    # compute the area of both the prediction and ground-truth rectangles
    boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))

    # intersection over union: the intersection area divided by the sum of
    # both areas minus the intersection area
    iou = interArea / float(boxAArea + boxBArea - interArea)

    # threshold the IoU at 0.5
    return round(iou)


def vlm_eval(bbox_truths: List[List[int]], bbox_predictions: List[List[int]]) -> float:
    """
    Average the per-sample IoU@0.5 match scores of predictions against ground truths.

    Parameters
    ----------
    bbox_truths : list of [left, top, width, height]
        Ground-truth boxes, one per sample.
    bbox_predictions : list of [left, top, width, height]
        Predicted boxes, in the same order as ``bbox_truths``.

    Returns
    -------
    float
        Mean of the 0/1 scores returned by ``bb_iou`` for each pair.

    Raises
    ------
    ValueError
        If the two lists differ in length. ``zip`` would otherwise silently
        drop the unmatched tail and the missing samples would not count
        against the score.
    statistics.StatisticsError
        If both lists are empty (raised by ``mean``).
    """
    if len(bbox_truths) != len(bbox_predictions):
        raise ValueError(
            f"expected one prediction per ground truth, got {len(bbox_truths)} "
            f"truths and {len(bbox_predictions)} predictions"
        )
    return mean(
        bb_iou(bb_truth, bb_pred)
        for bb_truth, bb_pred in zip(bbox_truths, bbox_predictions)
    )

In [18]:
# Predicted boxes, one dict per sample: 'key' is the sample index and 'bbox' is the
# prediction in the [left, top, width, height] layout that bb_iou expects.
# NOTE(review): the repeated [380, 217, 760, 435] entries look like a centred default
# box rather than real detections - confirm against the prediction code.
results = [{'key': 0, 'bbox': [380, 217, 760, 435]}, {'key': 1, 'bbox': [803, 320, 123, 36]}, {'key': 2, 'bbox': [803, 320, 123, 36]}, {'key': 3, 'bbox': [1301, 489, 121, 42]}, {'key': 4, 'bbox': [1301, 489, 121, 42]}, {'key': 5, 'bbox': [380, 217, 760, 435]}, {'key': 6, 'bbox': [380, 217, 760, 435]}, {'key': 7, 'bbox': [380, 217, 760, 435]}, {'key': 8, 'bbox': [211, 444, 71, 64]}, {'key': 9, 'bbox': [913, 152, 32, 20]}, {'key': 10, 'bbox': [539, 116, 67, 53]}, {'key': 11, 'bbox': [1106, 110, 54, 32]}, {'key': 12, 'bbox': [380, 217, 760, 435]}, {'key': 13, 'bbox': [380, 217, 760, 435]}, {'key': 14, 'bbox': [446, 175, 53, 25]}]
# Ground truth for the same sample keys: the caption used as the query and the
# reference box in [left, top, width, height] layout.
truths = [{'key': 0, 'caption': 'blue and white missile', 'bbox': [1224, 284, 44, 36]}, {'key': 1, 'caption': 'green light aircraft', 'bbox': [688, 400, 56, 36]}, {'key': 2, 'caption': 'blue and white commercial aircraft', 'bbox': [800, 320, 128, 36]}, {'key': 3, 'caption': 'blue commercial aircraft', 'bbox': [1156, 496, 104, 60]}, {'key': 4, 'caption': 'white and yellow commercial aircraft', 'bbox': [1296, 488, 136, 44]}, {'key': 5, 'caption': 'white and blue fighter jet', 'bbox': [488, 196, 52, 44]}, {'key': 6, 'caption': 'blue and yellow fighter jet', 'bbox': [836, 464, 36, 36]}, {'key': 7, 'caption': 'grey and white fighter plane', 'bbox': [1060, 208, 64, 32]}, {'key': 8, 'caption': 'grey camouflage fighter jet', 'bbox': [212, 444, 72, 64]}, {'key': 9, 'caption': 'grey and black helicopter', 'bbox': [912, 144, 40, 28]}, {'key': 10, 'caption': 'grey commercial aircraft', 'bbox': [536, 116, 72, 52]}, {'key': 11, 'caption': 'red helicopter', 'bbox': [1100, 96, 60, 52]}, {'key': 12, 'caption': 'green and black camouflage helicopter', 'bbox': [1156, 268, 28, 32]}, {'key': 13, 'caption': 'grey and red fighter jet', 'bbox': [412, 352, 108, 56]}, {'key': 14, 'caption': 'black fighter plane', 'bbox': [448, 176, 52, 24]}]
# NOTE(review): the commented-out list below appears to be an alternative set of
# predictions that differs only in the top/height values; confirm whether it is
# still needed for comparison or can be deleted.
# results = [{'key': 0, 'bbox': [380, 217, 760, 435]}, {'key': 1, 'bbox': [803, 183, 123, 21]}, {'key': 2, 'bbox': [803, 183, 123, 21]}, {'key': 3, 'bbox': [1301, 279, 121, 24]}, {'key': 4, 'bbox': [1301, 279, 121, 24]}, {'key': 5, 'bbox': [380, 217, 760, 435]}, {'key': 6, 'bbox': [380, 217, 760, 435]}, {'key': 7, 'bbox': [380, 217, 760, 435]}, {'key': 8, 'bbox': [211, 253, 71, 37]}, {'key': 9, 'bbox': [913, 87, 32, 12]}, {'key': 10, 'bbox': [539, 66, 67, 31]}, {'key': 11, 'bbox': [1106, 63, 54, 18]}, {'key': 12, 'bbox': [380, 217, 760, 435]}, {'key': 13, 'bbox': [380, 217, 760, 435]}, {'key': 14, 'bbox': [446, 100, 53, 14]}]

In [19]:
# Score the predictions against the ground truth; both lists are index-aligned,
# so the i-th truth box is compared with the i-th predicted box.
eval_result = vlm_eval(
    bbox_truths=[entry["bbox"] for entry in truths],
    bbox_predictions=[entry["bbox"] for entry in results],
)
eval_result

[0.0, 0.0, 1, 0.0, 1, 0, 0, 0, 1, 1, 1, 1, 0.0, 0, 1]

In [None]:
green light aircraft tensor([200.7495,  45.5879, 231.4284,  50.7849]) tensor(0.4252) tensor(0)
green light aircraft [803, 183, 123, 21]
blue and white commercial aircraft tensor([200.7495,  45.5879, 231.4284,  50.7849]) tensor(0.3439) tensor(0)
blue and white commercial aircraft [803, 183, 123, 21]

In [None]:
green light aircraft tensor([200.7495,  79.8314, 231.4284,  88.9321], device='cuda:0') tensor(0.4252, device='cuda:0') tensor(0, device='cuda:0')
green light aircraft [803, 320, 123, 36]
blue and white commercial aircraft tensor([200.7495,  79.8314, 231.4284,  88.9321], device='cuda:0') tensor(0.3439, device='cuda:0') tensor(0, device='cuda:0')
blue and white commercial aircraft [803, 320, 123, 36]
blue commercial aircraft tensor([297.9743, 124.1211, 303.7122, 137.8791], device='cuda:0') tensor(0.2218, device='cuda:0') tensor(0, device='cuda:0')
blue commercial aircraft [1192, 498, 23, 55]
blue commercial aircraft tensor([325.1395, 122.0269, 355.4868, 132.5076], device='cuda:0') tensor(0.2396, device='cuda:0') tensor(0, device='cuda:0')
blue commercial aircraft [1301, 489, 121, 42]
INFO:     127.0.0.1:53782 - "POST /identify HTTP/1.1" 200 OK
white and yellow commercial aircraft tensor([325.1395, 122.0269, 355.4868, 132.5076], device='cuda:0') tensor(0.2629, device='cuda:0') tensor(0, device='cuda:0')
white and yellow commercial aircraft [1301, 489, 121, 42]
INFO:     127.0.0.1:53796 - "POST /identify HTTP/1.1" 200 OK
grey camouflage fighter jet tensor([134.8471,  28.9070, 151.6177,  42.2461], device='cuda:0') tensor(0.4064, device='cuda:0') tensor(0, device='cuda:0')
grey camouflage fighter jet [539, 116, 67, 53]
grey camouflage fighter jet tensor([ 52.7718, 110.6222,  70.6236, 126.5716], device='cuda:0') tensor(0.4107, device='cuda:0') tensor(0, device='cuda:0')
grey camouflage fighter jet [211, 444, 71, 64]
grey and black helicopter tensor([228.1348,  37.7945, 236.2081,  42.8887], device='cuda:0') tensor(0.3743, device='cuda:0') tensor(0, device='cuda:0')
grey and black helicopter [913, 152, 32, 20]
grey commercial aircraft tensor([134.8471,  28.9070, 151.6177,  42.2461], device='cuda:0') tensor(0.5378, device='cuda:0') tensor(0, device='cuda:0')
grey commercial aircraft [539, 116, 67, 53]
red helicopter tensor([276.5785,  27.3723, 290.0326,  35.3073], device='cuda:0') tensor(0.4996, device='cuda:0') tensor(0, device='cuda:0')
red helicopter [1106, 110, 54, 32]
INFO:     127.0.0.1:42980 - "POST /identify HTTP/1.1" 200 OK
black fighter plane tensor([111.5770,  43.7127, 124.8466,  49.9819], device='cuda:0') tensor(0.2040, device='cuda:0') tensor(0, device='cuda:0')
black fighter plane [446, 175, 53, 25]
INFO:     127.0.0.1:42992 - "POST /identify HTTP/1.1" 200 OK
^CINFO:     Shutting down


In [2]:
from typing import List
import io

import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection
import numpy as np

# Load the pretrained OWLv2 object-detection checkpoint by its hub identifier.
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

In [3]:
# Serialise the whole model object to a local file.
# NOTE(review): torch.save on a full module pickles the object, so loading it later
# needs compatible class definitions to be importable; saving model.state_dict() or
# calling model.save_pretrained() is the more portable option. Confirm what the
# consumer of 'locallysavedmodel.pth' expects before changing the format.
torch.save(model, 'locallysavedmodel.pth')