In [1]:
# from fashion_clip.fashion_clip import FashionCLIP
from os import path as osp
from PIL import Image, ImageOps
import numpy as np
import torch
import onnx
import onnxruntime as ort
from transformers import (
    CLIPProcessor, CLIPTokenizerFast, CLIPImageProcessor,
    CLIPModel,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection
)
from src.utils import get_project_root

PROJECT_ROOT_PATH = get_project_root()

### Export model to ONNX

In [6]:
# Vision CLIP
clip_vision_model = CLIPVisionModelWithProjection.from_pretrained('patrickjohncyh/fashion-clip')
clip_image_processor = CLIPImageProcessor()
inputs = clip_image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0].unsqueeze(0)
output = clip_vision_model(inputs)['image_embeds']

# Export Vision CLIP model to ONNX
torch.onnx.export(
    model=clip_vision_model,
    args=inputs,
    f='../tmp/fashion_clip_image.onnx',
    input_names=['input'],
)

In [9]:
# Load ONNX model and test
ort_sess = ort.InferenceSession('../tmp/fashion_clip_image.onnx', providers=['CUDAExecutionProvider'])
img_embed = ort_sess.run(None, {'input': torch.Tensor.numpy(inputs)})[0]

In [23]:
# Text CLIP
clip_text_model = CLIPTextModelWithProjection.from_pretrained('patrickjohncyh/fashion-clip')
tokenizer = CLIPTokenizerFast.from_pretrained('patrickjohncyh/fashion-clip')
texts = ['Short Dress', 'Long Dress']
inputs = tokenizer(text=texts, return_tensors='pt')
output = clip_text_model(
    input_ids=inputs['input_ids'],
    # attention_mask=inputs['attention_mask']
)

torch.onnx.export(
    model=clip_text_model,
    args=(inputs['input_ids']),
    f='../tmp/fashion_clip_text.onnx',
    input_names=['input']
)

  if input_shape[-1] > 1 or self.sliding_window is not None:
  if past_key_values_length > 0:


In [25]:
output.text_embeds

tensor([[-0.0753,  0.1137, -0.2012,  ...,  0.2871, -0.1401, -0.1745],
        [ 0.1993,  0.0851, -0.2964,  ..., -0.0155, -0.2383, -0.0200]],
       grad_fn=<MmBackward0>)

In [24]:
to_numpy = lambda x: torch.Tensor.numpy(x)
ort_sess = ort.InferenceSession('../tmp/fashion_clip_text.onnx', provider=['CUDAExecutionProvider'])
ort_sess.run(None, {'input': to_numpy(inputs['input_ids'])})

[array([[-0.07526917,  0.1137128 , -0.20121977, ...,  0.2870652 ,
         -0.14005005, -0.17451927],
        [ 0.19929732,  0.08510898, -0.29637247, ..., -0.0154532 ,
         -0.23826578, -0.02000891]], dtype=float32),
 array([[[ 0.08299014,  0.06903712,  0.3631887 , ..., -0.0869887 ,
           0.22818953,  0.47101068],
         [ 1.97926   ,  0.32932737,  1.1483428 , ..., -1.5879972 ,
           0.7777781 , -0.25711507],
         [ 2.6857781 ,  0.8550246 ,  1.7345062 , ..., -0.7738764 ,
          -0.6795901 , -0.28669494],
         [ 1.0880939 ,  1.0650144 ,  0.52067816, ..., -2.678224  ,
          -0.13050346, -1.3930943 ]],
 
        [[ 0.08299014,  0.06903712,  0.3631887 , ..., -0.0869887 ,
           0.22818953,  0.47101068],
         [-0.64101744, -0.4311428 , -0.48019487, ...,  0.6071063 ,
           0.8231285 , -0.4630533 ],
         [ 2.259399  ,  1.0240291 ,  1.4451071 , ..., -0.05386524,
          -0.10072777, -0.77439076],
         [ 1.8626235 ,  0.7605586 ,  0.63829696,

In [26]:
# Create CLIP-based model and try inference
# model = CLIPModel.from_pretrained('patrickjohncyh/fashion-clip')
# processor = CLIPProcessor.from_pretrained('patrickjohncyh/fashion-clip')
# img = Image.open('../assets/damngan2.png')
# img = ImageOps.fit(img, size=((384, 512)))
# text = ['Short Dress', 'Long Dress']
# inputs = processor(text=text, images=img, return_tensors='pt',)
# outputs = model(**inputs)
# print(inputs.keys())
# print(outputs.keys())
# print(outputs['image_embeds'].shape)

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])
odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])
torch.Size([1, 512])


In [27]:
# torch.onnx.export(
#     model=model,
#     args=(inputs['input_ids'], inputs['pixel_values']),
#     f='../tmp/fashion_clip.onnx',
#     input_names=['input'],
# )

In [None]:
# ort_sess = ort.InferenceSession('../tmp/fashion_clip.onnx', providers=['CUDAExecutionProvider'])
# inputs = processor(text=text, images=img, return_tensors='pt',)
# ort_sess.run(None, {'input': (to_numpy(inputs['input_ids']), to_numpy(inputs['pixel_values']))})

In [None]:
MODEL_PATH = osp.join(PROJECT_ROOT_PATH, 'checkpoints/fashion_clip/model.onnx')

ort_sess = ort.InferenceSession(MODEL_PATH)
processor = CLIPProcessor.from_pretrained('patrickjohncyh/fashion-clip')

In [None]:
# Load image
img = Image.open('../assets/damngan2.png')
img = ImageOps.fit(img, size=((384, 512)))

# Inputs
text = ['short dress', 'long dress']
inputs = processor(text=text, images=img, return_tensors='pt', padding=True)

ort_sess.run(None, inputs)

In [None]:
fclip = FashionCLIP('fashion-clip')

img = Image.open('../assets/damngan2.png')
ImageOps.fit(img, size=((192, 384)))

In [None]:
img_embed = fclip.encode_images([img], batch_size=1)
normalized_img_embed = img_embed / np.sqrt(np.sum(img_embed**2))

In [None]:
types = ['short dress', 'long dress']
types_embed = fclip.encode_text(types, batch_size=2)

In [None]:
norm = np.sqrt(np.sum(types_embed**2, axis=1))
normalized_types_embed = types_embed / np.expand_dims(norm, axis=1)

In [None]:
output = np.dot(normalized_types_embed, np.transpose(img_embed))

In [None]:
idx = np.argmax(output)
if idx == 0:
    print(types[0])
elif idx == 1:
    print(types[1])
else:
    raise ValueError(f'Value {idx} is not supported.')