In [None]:
# IMPORTANT: Make sure you're using a GPU runtime!

In [None]:
# Based on this notebook: https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb

In [None]:
# Fetch the CLIP source and make the checkout the working directory. Later cells
# rely on this: they patch ./clip/model.py by relative path and then `import clip`
# (presumably picking up this local, patched copy — confirm if the kernel's cwd differs).
!git clone https://github.com/openai/CLIP
%cd CLIP

In [None]:
# This is SUPER hacky because I don't know a better way (that's quick). Basically the vision model is ready to export as-is, like this:
#   torch.onnx.export(model.visual, ...)
# but the text model has a couple of pre-processing steps (like converting tokens to embeddings), and I'd like to have all that
# processing contained within the onnx file for the text encoder. The `torch.onnx.export` function seems to only be able to
# take a *model* as an input, and not a function (like `model.encode_text`), so I'm hackily renaming `model.encode_text` to
# `model.forward` so that I can then write:
#   torch.onnx.export(model, ...)
# to export the text encoder. I'm sure there's a much better way to do this. If this stops working, note that
# it was working at the following commit hash, so you can clone this to get it working: https://github.com/openai/CLIP/tree/573315e83f07b53a61ff5098757e8fc885f1703e
#
# The two in-place substitutions below rewrite ./clip/model.py:
#   1. the existing `forward(self, image, text)` is renamed to `old_forward`, freeing the name `forward`;
#   2. `encode_text(self, text)` is renamed to `forward(self, text)`.
# NOTE: sed does not report an error when a pattern matches nothing, so if these exact signature
# lines ever change upstream, model.py is left untouched and the text export later on will not
# behave as intended.
!sed -i -e 's/def forward(self, image, text):/def old_forward(self, image, text):/g' ./clip/model.py
!sed -i -e 's/def encode_text(self, text):/def forward(self, text):/g' ./clip/model.py

In [None]:
! pip install ftfy regex tqdm

In [None]:
import numpy as np
import torch
import clip

# Display what clip.available_models() reports (presumably the names that
# clip.load() accepts, e.g. the "ViT-B/32" used below — confirm against the CLIP README).
clip.available_models()

In [None]:
# Load the ViT-B/32 checkpoint together with the `preprocess` callable that is
# applied to each image later on, then move the model to the GPU and put it in
# evaluation mode.
model, preprocess = clip.load("ViT-B/32")
model.cuda().eval()

# Basic facts about the loaded model, kept as notebook variables.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar parameters, summed over every parameter tensor.
parameter_count = sum(p.numel() for p in model.parameters())
print("Model parameters:", f"{parameter_count:,}")

for label, value in (
    ("Input resolution:", input_resolution),
    ("Context length:", context_length),
    ("Vocab size:", vocab_size),
):
    print(label, value)

In [None]:
# Sanity check: display the tokenizer output for a sample string.
clip.tokenize("Hello World!")

In [13]:
import os
import skimage
from PIL import Image
import numpy as np

from collections import OrderedDict
import torch

# Captions for the skimage sample images to use, keyed by image name
# (the file name without its extension).
descriptions = {}
descriptions["astronaut"] = "a portrait of an astronaut with the American flag"

In [14]:
# Gather the skimage sample images that have a caption in `descriptions`.
#   original_images - the images as loaded, converted to RGB
#   images          - the same images after `preprocess`
#   texts           - the caption of each image, in matching order
original_images, images, texts = [], [], []

# Only .png / .jpg files in the skimage data directory are candidates.
image_filenames = [f for f in os.listdir(skimage.data_dir) if f.endswith((".png", ".jpg"))]

for filename in image_filenames:
    name = os.path.splitext(filename)[0]
    if name in descriptions:
        image = Image.open(os.path.join(skimage.data_dir, filename)).convert("RGB")
        original_images.append(image)
        images.append(preprocess(image))
        texts.append(descriptions[name])

In [15]:
# Stack the preprocessed images into one batch, then convert it to a
# half-precision tensor on the GPU.
image_batch = np.stack(images)
image_input = torch.tensor(image_batch).half().cuda()

# Prefix every caption with "This is ", tokenize the whole list in one call,
# and move the tokens to the GPU as well.
prompts = ["This is " + desc for desc in texts]
text_tokens = clip.tokenize(prompts).cuda()

In [None]:
# Run the image encoder on the batch and display the first row of its output.
model.visual(image_input)[0] # astronaut pic embedding

In [None]:
# Calling `model(...)` runs the text encoder here: the sed edits to ./clip/model.py
# renamed encode_text to forward. Display the first row of its output.
model(text_tokens)[0] # astronaut text embedding

In [None]:
# Export the text encoder to ONNX. `model` itself is passed because the sed edits
# to ./clip/model.py made the text encoder its `forward`; `text_tokens` is the
# example input used for the export.
torch.onnx.export(
    model,
    text_tokens,
    "clip-text-vit-32.onnx",
    export_params=True,
    opset_version=12,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    # Axis 0 of both the input and the output is declared dynamic ('batch_size').
    dynamic_axes={
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'},
    },
)

In [None]:
# Export the image encoder (`model.visual`) to ONNX, using `image_input` as the
# example input for the export.
torch.onnx.export(
    model.visual,
    image_input,
    "clip-image-vit-32.onnx",
    export_params=True,
    opset_version=12,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    # Axis 0 of both the input and the output is declared dynamic ('batch_size').
    dynamic_axes={
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'},
    },
)

In [None]:
# Use this option in the torch.onnx.export calls above if you get an "Unable to cast from non-held to held instance (T& to Holder<T>)" error:
#   operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK

In [None]:
# The onnx model files are now in the /content/CLIP directory.

In [None]:
# The code below is for converting to tflite, tfjs and tf saved model:

In [None]:
!pip install git+https://github.com/onnx/onnx-tensorflow.git

In [None]:
# Convert each exported ONNX file (-i) into an output directory (-o). Those
# directories are consumed as TensorFlow SavedModels by the tfjs and tflite
# conversion cells further down.
!onnx-tf convert -i clip-image-vit-32.onnx -o clip-image-vit-32-tf
!onnx-tf convert -i clip-text-vit-32.onnx -o clip-text-vit-32-tf

In [None]:
!pip install tensorflowjs

In [None]:
# Convert each SavedModel directory (first path) into a TensorFlow.js model
# written to the second path.
!tensorflowjs_converter --input_format tf_saved_model ./clip-image-vit-32-tf ./clip-image-vit-32-tfjs
!tensorflowjs_converter --input_format tf_saved_model ./clip-text-vit-32-tf ./clip-text-vit-32-tfjs

In [None]:
import tensorflow as tf

# Convert both SavedModel directories to .tflite files with the same recipe.
# The image encoder is processed first and the text encoder second, so the
# notebook variables left behind (`converter`, `tflite_model`) belong to the
# text encoder.
tflite_jobs = [
    ("./clip-image-vit-32-tf", 'clip-image-vit-32.tflite'),  # image encoder
    ("./clip-text-vit-32-tf", 'clip-text-vit-32.tflite'),  # text encoder
]

for saved_model_dir, tflite_path in tflite_jobs:
    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    # Allowing SELECT_TF_OPS next to the builtin ops is needed because:
    # https://github.com/tensorflow/tfjs/issues/5844
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    tflite_model = converter.convert()
    with open(tflite_path, 'wb') as f:
        f.write(tflite_model)