# Speed Test of Open CLIP models

In [17]:
"""
import packets
"""
import torch
from PIL import Image
import open_clip as clip
from utils.benchmark_utils import track_infer_time, print_timings
import huggingface_hub


In [18]:
"""
Check the available pretrained models from OPEN CLIP
"""
clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN50-quickgelu', 'openai'),
 ('RN50-quickgelu', 'yfcc15m'),
 ('RN50-quickgelu', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN101-quickgelu', 'openai'),
 ('RN101-quickgelu', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_

In [19]:
"""
We shall define our parameters, choose from the available ones
"""
model_arch = 'ViT-L-14'
pretrained = 'laion2b_s32b_b82k'

# common parameters
batch_size = 1
device = 'cuda' # use 'cpu' if you do not have NVIDIA GPU, will research on how to use Intel/AMD GPUs ('ipex','rocm') if we want to test
warmup = 10
n = 100


In [20]:
# model loader
def prepare_pytorch_model(model_arch, pretrained, device):
    pt_model,_,preprocess = clip.create_model_and_transforms(model_arch,pretrained=pretrained)
    tokenizer = clip.get_tokenizer(model_arch)
    if device == "cuda":
        pt_model.cuda()
    return pt_model,preprocess,tokenizer

In [21]:
# Speed test preparation work
print(f"Prepare the Pytorch model from {model_arch} {pretrained}")
pt_model,preprocess,tokenizer = prepare_pytorch_model(model_arch, pretrained, device)
im = Image.open("test-L.jpg")
print(im.size)
image = preprocess(im).unsqueeze(0) #image from http://images.cocodataset.org/val2017/000000039769.jpg, rename to 'CLIP.jpg'
text = tokenizer(["a diagram", "a dog", "a cat","a bus","a road","a car"])
if device == "cuda":
    image = image.cuda()
    text = text.cuda()


Prepare the Pytorch model from ViT-L-14 laion2b_s32b_b82k
(5749, 3234)


In [22]:
# test the image feature extraction
print("Begin the image feature extraction speed test...")

for i in range(warmup):
    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = pt_model.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)
print("Forward the Pytorch image model...")
time_buffer = list()
for i in range(n):
    with track_infer_time(time_buffer):
        with torch.no_grad(), torch.cuda.amp.autocast():
            image_features = pt_model.encode_image(image)
            image_features /= image_features.norm(dim=-1, keepdim=True)
print_timings(name=f"Pytorch image inference speed (batch-size: {batch_size}):", timings=time_buffer)

Begin the image feature extraction speed test...
Forward the Pytorch image model...
[Pytorch image inference speed (batch-size: 1):] mean=16.49ms, sd=1.50ms, min=8.11ms, max=18.81ms, median=17.13ms, 95p=17.63ms, 99p=17.87ms


In [23]:
# test the image feature extraction
print("Begin the text feature extraction speed test...")

for i in range(warmup):
    with torch.no_grad(), torch.cuda.amp.autocast():
        text_features = pt_model.encode_text(text)
        text_features /= text_features.norm(dim=-1, keepdim=True)
print("Forward the Pytorch text model...")
time_buffer = list()
for i in range(n):
    with track_infer_time(time_buffer):
        with torch.no_grad(), torch.cuda.amp.autocast():
            text_features = pt_model.encode_text(text)
            text_features /= text_features.norm(dim=-1, keepdim=True)
print_timings(name=f"Pytorch text inference speed (batch-size: {batch_size}):", timings=time_buffer)

Begin the text feature extraction speed test...
Forward the Pytorch text model...
[Pytorch text inference speed (batch-size: 1):] mean=8.08ms, sd=0.09ms, min=7.90ms, max=8.39ms, median=8.06ms, 95p=8.21ms, 99p=8.39ms


In [24]:
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]


Label probs: tensor([[1.4305e-06, 1.4901e-06, 3.2187e-05, 1.0000e+00, 2.1756e-05, 5.2273e-05]],
       device='cuda:0', dtype=torch.float16)
