In [48]:
from cn_clip.clip import tokenize, image_transform
from cn_clip.clip.utils import _MODEL_INFO
import onnxruntime
import os
from pathlib import Path
import numpy as np
from PIL import Image
import torch

## 文字编码

In [8]:
# Build the ONNX Runtime inference session for the text encoder model.
text_sess_options = onnxruntime.SessionOptions()
# Per-run options with log severity 2 (WARNING on ONNX Runtime's 0-4 scale).
# NOTE(review): none of the visible cells pass this object to text_session.run();
# confirm whether it was meant to be supplied via run_options=.
text_run_options = onnxruntime.RunOptions()
text_run_options.log_severity_level = 2
text_onnx_model_path = Path('.') / 'data_path' / 'txt.fp32.onnx'
# Hand the model location over as a plain string: older onnxruntime releases
# only accept str or bytes here and reject pathlib.Path objects.
text_session = onnxruntime.InferenceSession(
    str(text_onnx_model_path.absolute()),
    sess_options=text_sess_options,
)

In [77]:
# Run the ONNX text encoder on the tokenized query; r[0] is the first model output.
r = text_session.run(None, { 'text': tokenize('皮球').numpy() })
text_vector = r[0]
# L2-normalize every row independently. Dividing by the norm of row 0 alone
# (as before) is only correct for a single-row batch and disagrees with the
# torch normalization below.
text_vector = text_vector / np.linalg.norm(text_vector, axis=-1, keepdims=True)
text_features = torch.tensor(r[0])
text_features = text_features / text_features.norm(dim=-1, keepdim=True) # Normalized Chinese-CLIP text features, for downstream tasks

In [10]:
# Dump the text encoder output r[0] (as returned by the session, before any
# normalization) to a text file with 8 decimal places per value.
np.savetxt(fname='text-result.text', X=r[0], fmt='%.8f')

## 图像编码

In [12]:
# Build the ONNX Runtime inference session for the image encoder model.
img_sess_options = onnxruntime.SessionOptions()
# Per-run options with log severity 2 (WARNING on ONNX Runtime's 0-4 scale).
# NOTE(review): none of the visible cells pass this object to img_session.run();
# confirm whether it was meant to be supplied via run_options=.
img_run_options = onnxruntime.RunOptions()
img_run_options.log_severity_level = 2
img_onnx_model_path = Path('.') / 'data_path' / 'img.fp32.onnx'
# Hand the model location over as a plain string: older onnxruntime releases
# only accept str or bytes here and reject pathlib.Path objects.
img_session = onnxruntime.InferenceSession(
    str(img_onnx_model_path.absolute()),
    sess_options=img_sess_options,
)

In [13]:
model_arch = "ViT-B-16"
# Build the preprocessing pipeline for the input resolution registered for this architecture.
preprocess = image_transform(_MODEL_INFO[model_arch]['input_resolution'])
# Example Pikachu image; preprocessing yields a Torch tensor of size
# [1, 3, resolution, resolution]. The context manager closes the underlying
# file once the pixel data has been consumed instead of leaving it open.
with Image.open("examples/pokemon.jpeg") as pokemon_image:
    image = preprocess(pokemon_image).unsqueeze(0)

In [14]:
# Run the ONNX image encoder on the preprocessed image batch, feeding it
# through the model input named "image"; no explicit output names are given.
img_res = img_session.run(
    output_names=None,
    input_feed={"image": image.numpy()},
)

In [68]:
image_vector = img_res[0]
# L2-normalize every row independently. Dividing by the norm of row 0 alone
# (as before) is only correct for a single-row batch and disagrees with the
# torch normalization below.
image_vector = image_vector / np.linalg.norm(image_vector, axis=-1, keepdims=True)
image_features = torch.tensor(img_res[0])
image_features /= image_features.norm(dim=-1, keepdim=True) # Normalized Chinese-CLIP image features, for downstream tasks

In [47]:
# Dump the image encoder output img_res[0] (as returned by the session, before
# any normalization) to a text file with 8 decimal places per value.
np.savetxt(fname='image-result.text', X=img_res[0], fmt='%.8f')

In [17]:
# Display the model registry imported from cn_clip.clip.utils: per the output
# below, it maps each architecture name to its 'struct' string and 'input_resolution'.
_MODEL_INFO

{'ViT-B-16': {'struct': 'ViT-B-16@RoBERTa-wwm-ext-base-chinese',
  'input_resolution': 224},
 'ViT-L-14': {'struct': 'ViT-L-14@RoBERTa-wwm-ext-base-chinese',
  'input_resolution': 224},
 'ViT-L-14-336': {'struct': 'ViT-L-14-336@RoBERTa-wwm-ext-base-chinese',
  'input_resolution': 336},
 'ViT-H-14': {'struct': 'ViT-H-14@RoBERTa-wwm-ext-large-chinese',
  'input_resolution': 224},
 'RN50': {'struct': 'RN50@RBT3-chinese', 'input_resolution': 224}}

In [78]:
# Image-text similarity from the torch features: since both sides were divided
# by their own norms in earlier cells, this matrix product is the cosine similarity.
torch.matmul(image_features, text_features.t())

tensor([[0.3681]])

In [79]:
# Same image-text similarity computed from the NumPy vectors, as a cross-check
# against the torch result.
image_vector @ text_vector.T

array([[0.36813587]], dtype=float32)