In [1]:
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import onnxruntime as ort
from transformers import AutoImageProcessor, AutoModel
from safetensors.torch import load_file

## Explore

### Hugging face

In [None]:
# Load preprocessor
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
processor.crop_size['height'] = 518
processor.crop_size['width'] = 518
processor.size['shortest_edge'] = 518

# Load model
dinov2_hf = AutoModel.from_pretrained('facebook/dinov2-base').eval().cuda() # 346M

In [28]:
# forward a sample image
image = Image.open('/workspace/dataset/samples/truck.jpg')
inputs = processor(images=image, return_tensors="pt")
processed_image_hf = inputs["pixel_values"].cuda()

outputs = dinov2_hf(processed_image_hf)
last_hidden_states = outputs.last_hidden_state  # shape (1, 1370, 768)
vector_rep_hf = last_hidden_states[:, 0, :]     # shape (1, 768)

### Preproccessor

In [None]:
# Parameters from BitImageProcessor
crop_size = (518, 518)
rescale_factor = 1.0 / 255.0
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
shortest_edge_size = 518

def preprocessor(image_path):
    image = Image.open(image_path)
    image = image.convert("RGB")

    # Resize to 256
    width, height = image.size
    if width < height:
        new_width = shortest_edge_size
        new_height = int(shortest_edge_size * height / width)
    else:
        new_height = shortest_edge_size
        new_width = int(shortest_edge_size * width / height)
    image = image.resize((new_width, new_height), Image.BILINEAR)
    
    # Center crop to 224x224
    left = (new_width - crop_size[0]) / 2
    top = (new_height - crop_size[1]) / 2
    right = (new_width + crop_size[0]) / 2
    bottom = (new_height + crop_size[1]) / 2
    image = image.crop((left, top, right, bottom))
    
    # Convert image to numpy array and rescale
    image_array = np.array(image) * rescale_factor
    
    # Normalize
    image_array = (image_array - mean) / std
    return image_array

# Example usage
processed_image = preprocessor('/workspace/dataset/samples/truck.jpg')
processed_image = np.transpose(processed_image, (2, 0, 1))
processed_image = torch.tensor(processed_image).unsqueeze(0).to(torch.float32).cuda()

In [47]:
def compute_similarity(tensor1, tensor2):
    return torch.mean(torch.abs(tensor1 - tensor2))

similarity = compute_similarity(processed_image_hf, processed_image)
print(f"Mean Absolute Difference: {similarity.item()}")

Mean Absolute Difference: 0.06717944145202637


### Github TorchHub

In [None]:
dinov2_vitb14 = torch.hub.load(repo_or_dir='facebookresearch/dinov2', model='dinov2_vitb14').eval().cuda() # 330M
vector_rep_hub = dinov2_vitb14(processed_image)  # shape (1, 768)

In [8]:
# Two outputs are almost similar
def similarity(a:np.ndarray, b:np.ndarray):
    return (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))

vector_rep_hf_np = vector_rep_hf.detach().cpu().numpy().reshape(-1, )
vector_rep_hub_np = vector_rep_hub.detach().cpu().numpy().reshape(-1, )
similarity(vector_rep_hf_np, vector_rep_hub_np)

0.99993056

### Transformers model

In [2]:
from transformers import Dinov2Model
from safetensors.torch import load_file

# Load the safetensors state_dict
state_dict = load_file("/workspace/assets/models/dinov2/hugging-face/model.safetensors")
model_size = 'base'

# Load the Dinov2Model with the state_dict and config
dinov2_model, matching = Dinov2Model.from_pretrained(
    pretrained_model_name_or_path="/workspace/assets/models/dinov2/hugging-face/",
    state_dict=state_dict,
    output_loading_info=True,
    config="/workspace/assets/configs/dinov2/config.json",
)

In [4]:
dinov2_model = dinov2_model.eval().cuda()
output = dinov2_model(processed_image)
last_hidden_states = output.last_hidden_state # (1, 257, 768)
vector_rep_tr = last_hidden_states[:, 0, :]   # (1, 768)

## Convert to ONNX and OpenVINO

In [3]:
class Wrapper(torch.nn.Module):
    def __init__(self, dinov2_model):
        super().__init__()
        self.dinov2_model = dinov2_model
    def forward(self, tensor):
        return self.dinov2_model(tensor).last_hidden_state[:, 0, :]
    
model = Wrapper(dinov2_model).to('cpu')
inp = torch.randn(1, 3, 224, 224)
output = model(inp)

In [4]:
dynamic_batch = {
    'input'  : {0 : 'batch_size'},
    'output' : {0 : 'batch_size'}
}

torch.onnx.export(
    model, torch.randn(1, 3, 518, 518), 
    "/workspace/assets/models/dinov2/onnx/dinov2.onnx",
    export_params=True, do_constant_folding=True,
    input_names=['input'], output_names=['output'],
    dynamic_axes=dynamic_batch
)

### TensorRT Backend

In [9]:
image = preprocessor('/workspace/dataset/samples/truck.jpg')
image = np.transpose(image, (2, 0, 1))[np.newaxis, ...]
image = np.float32(image)

In [4]:
engine_cache_path = "/workspace/assets/models/dinov2/onnx"
providers = [
    ('TensorrtExecutionProvider', {
        'device_id': 0,
        'trt_max_workspace_size': 21474836480,
        'trt_fp16_enable': False,
        'trt_engine_cache_enable': True,
        'trt_engine_cache_path': engine_cache_path}),
    
    ('CUDAExecutionProvider', {})    
]

dino_onnx = ort.InferenceSession("/workspace/assets/models/dinov2/onnx/dinov2.onnx", providers=providers)
dino_onnx_input = {"input": image}
out_embedding = dino_onnx.run(None, dino_onnx_input)[0]

In [11]:
%%time
out_embedding = dino_onnx.run(None, dino_onnx_input)[0]

CPU times: user 130 ms, sys: 3.11 ms, total: 133 ms
Wall time: 129 ms
