In [None]:
# https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirementsto
# https://developer.nvidia.com/rdp/cudnn-archive
# https://medium.com/@Rahul_Meduri/install-cuda-cudnn-in-conda-virtual-environment-and-setup-gpu-support-using-tensorflow-f8a4c942b6ea

# https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models
# https://huggingface.co/google/siglip-base-patch16-224/tree/main
# https://huggingface.co/docs/transformers/main/model_doc/siglip#transformers.SiglipTextModel
# https://huggingface.co/blog/convert-transformers-to-onnx

In [2]:
import os
from dotenv import load_dotenv
import requests
import numpy as np
import torch
import onnxruntime
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    SiglipModel,
    SiglipVisionModel,
    SiglipTextModel,
)
from huggingface_hub import HfApi, login

load_dotenv()

hf_key = os.getenv("HF_KEY")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# get the image from the url
headers = {"Authorization": f"Bearer {hf_key}"}
API_URL = "https://datasets-server.huggingface.co/rows?dataset=huggan/wikiart&config=default&split=train&offset=1&length=100"


def query():
    response = requests.get(API_URL, headers=headers)
    return response.json()


data = query()
len(data["rows"])  # 100 images

100

In [3]:
# load the model and the tokenizer
model_org = SiglipModel.from_pretrained("nielsr/siglip-base-patch16-224").to(device)
processor = AutoProcessor.from_pretrained("nielsr/siglip-base-patch16-224")
tokenizer = AutoTokenizer.from_pretrained("nielsr/siglip-base-patch16-224")



### Siglip Vision

In [50]:
url = data["rows"][0]["row"]["image"]["src"]
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt")

vision_model = SiglipVisionModel.from_pretrained("nielsr/siglip-base-patch16-224")
torch_out = vision_model(**inputs)

In [51]:
torch.onnx.export(
    vision_model,
    tuple(inputs.values())[0],
    f="siglip_vision.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    do_constant_folding=True,
    opset_version=13,
)

  if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
  if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):


In [18]:
ort_session = onnxruntime.InferenceSession(
    "siglip_vision.onnx",
    providers=[
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
)


def to_numpy(tensor):
    return (
        tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
    )

In [19]:
import time

with torch.no_grad():
    url = data["rows"][0]["row"]["image"]["src"]
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(images=image, return_tensors="pt")

    start = time.time()
    torch_out = model_org.get_image_features(**inputs.to(device))
    end = time.time()
    print(f"Inference of Pytorch model used {end - start} seconds")

    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(tuple(inputs.values())[0])}
    start = time.time()
    ort_outs = ort_session.run(None, ort_inputs)
    end = time.time()
    print(f"Inference of ONNX model used {end - start} seconds")

Inference of Pytorch model used 0.021751880645751953 seconds
Inference of ONNX model used 0.03810739517211914 seconds


In [None]:
# time of inference onnx model is higher than pytorch model on GPU a bit
# but on CPU it is faster
# i don't know why :v
# maybe torch 2.0 optimize the model better than onnxruntime

In [19]:
np.mean(ort_outs[1] - torch_out.detach().cpu().numpy())

2.1876378e-09

In [3]:
login(hf_key)

api = HfApi()
api.upload_file(
    path_or_fileobj="siglip_vision.onnx",
    path_in_repo="siglip_vision.onnx",
    repo_id="hieuGoku/siglip_onnx",
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/hieugn/.cache/huggingface/token
Login successful


siglip_vision.onnx: 100%|██████████| 372M/372M [00:41<00:00, 8.89MB/s]  


CommitInfo(commit_url='https://huggingface.co/hieuGoku/siglip_onnx/commit/3d4e92c95a038ac1b5c03e24af0cfff2b2641865', commit_message='Upload siglip_vision.onnx with huggingface_hub', commit_description='', oid='3d4e92c95a038ac1b5c03e24af0cfff2b2641865', pr_url=None, pr_revision=None, pr_num=None)

### Siglip Text

In [4]:
prompt = "a woman"
text_token = tokenizer([prompt], return_tensors="pt")
text_model = SiglipTextModel.from_pretrained("nielsr/siglip-base-patch16-224")
torch_out = text_model(**text_token)

In [5]:
torch.onnx.export(
    text_model,
    tuple(text_token.values())[0],
    f="siglip_text.onnx",
    input_names=["input_ids"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence"},
        "attention_mask": {0: "batch_size", 1: "sequence"},
        "output": {0: "batch_size", 1: "sequence"},
    },
    do_constant_folding=True,
    opset_version=13,
)

  if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
  if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):


In [60]:
ort_session = onnxruntime.InferenceSession(
    "siglip_text.onnx", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)


def to_numpy(tensor):
    return (
        tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
    )

In [40]:
import time

prompt = "a woman"
text_token = tokenizer([prompt], return_tensors="pt")

start = time.time()
text_features = model_org.get_text_features(**text_token.to(device))
end = time.time()
print(f"Inference of Pytorch model used {end - start} seconds")

ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(tuple(text_token.values())[0])}
start = time.time()
ort_outs = ort_session.run(None, ort_inputs)
end = time.time()
print(f"Inference of ONNX model used {end - start} seconds")

Inference of Pytorch model used 0.023141860961914062 seconds
Inference of ONNX model used 0.0066034793853759766 seconds


In [41]:
np.mean(ort_outs[1] - text_features.detach().cpu().numpy())

-3.3614924e-09

In [4]:
api.upload_file(
    path_or_fileobj="siglip_text.onnx",
    path_in_repo="siglip_text.onnx",
    repo_id="hieuGoku/siglip_onnx",
)

siglip_text.onnx: 100%|██████████| 441M/441M [00:41<00:00, 10.6MB/s] 


CommitInfo(commit_url='https://huggingface.co/hieuGoku/siglip_onnx/commit/e1e7da6d2f08927e4ad6bd3d3c133e3ecd40b258', commit_message='Upload siglip_text.onnx with huggingface_hub', commit_description='', oid='e1e7da6d2f08927e4ad6bd3d3c133e3ecd40b258', pr_url=None, pr_revision=None, pr_num=None)