In [1]:
from tqdm import tqdm
from PIL import Image
import pandas as pd
import numpy as np
import open_clip
import torch
import h5py
import glob

# Hugging Face Hub identifier of the pretrained CLIP checkpoint loaded by open_clip.
target = 'hf-hub:laion/CLIP-ViT-B-16-laion2B-s34B-b88K'
# Prefer the first CUDA device, but fall back to CPU so the notebook still runs
# (slowly) on machines without a GPU instead of failing at `model.to(device)`.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Load the model and its image preprocessing transform.

In [2]:
# create_model_and_transforms returns three values; only the model and the
# third (bound here as `preprocess`) are used in this notebook, so the middle
# value is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(target)
# Move the weights to the selected device and switch to eval mode: the model
# is used purely for inference here, so train-time behaviour (dropout,
# stochastic depth, ...) must be disabled. Both calls return the module
# itself, so the cell still displays the model summary.
model.to(device).eval()

CLIP(
  (visual): VisionTransformer(
    (patchnorm_pre_ln): Identity()
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((7

Get a list of all images.

In [3]:
# glob returns matches in a filesystem-dependent order. Sorting makes the
# filename order (and therefore the row order of the embeddings computed from
# it) reproducible across runs and machines.
fnames = sorted(glob.glob('data/unsplash-512/*.png'))
len(fnames)

24999

Loop over batches of 32 images and generate embeddings.

In [4]:
BATCH_SIZE = 32
# One NumPy array per batch, in the same order as `fnames`.
image_features = []
# Inference only: disabling autograd avoids building a graph for every batch,
# which is what made per-iteration `del` / `torch.cuda.empty_cache()` calls
# necessary to keep GPU memory in check.
with torch.no_grad():
    for i in tqdm(range(0, len(fnames), BATCH_SIZE)):
        batch_images = []
        for fname in fnames[i:i+BATCH_SIZE]:
            # The context manager closes the file handle as soon as the
            # transform has consumed the image.
            with Image.open(fname) as img:
                batch_images.append(preprocess(img))
        # Stack the per-image tensors directly instead of round-tripping
        # through a NumPy array.
        image_batch = torch.stack(batch_images).to(device)
        image_batch_features = model.encode_image(image_batch).float()
        # Move results to host memory so GPU usage stays bounded by one batch.
        image_features.append(image_batch_features.cpu().numpy())

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 782/782 [07:26<00:00,  1.75it/s]


L2-normalize each embedding to unit length. This is not strictly necessary for cosine similarity, but it will allow us to experiment with different similarity/distance metrics.

In [5]:
# Collapse the per-batch arrays into a single 2-D matrix (one row per image).
stacked_features = np.vstack(image_features)
# Euclidean norm of each row, kept as a column so it broadcasts across the row.
row_norms = np.linalg.norm(stacked_features, axis=1, keepdims=True)
# Scale every row to unit length.
image_features = stacked_features / row_norms

Persist to an HDF5 file. It is important to keep the vectors and their associated filenames aligned.

In [6]:
# Row i of the vectors must correspond to fnames[i]; fail loudly before
# writing anything if the two have drifted apart (e.g. a cell was re-run
# out of order).
assert len(fnames) == len(image_features), 'fnames and image_features are misaligned'
with h5py.File('data/unsplash-25k.hdf5', 'w') as f:
    group = f.create_group('unsplash-512')
    group.create_dataset('vectors', data=image_features)
    # Store filenames as variable-length UTF-8 strings explicitly rather than
    # relying on NumPy's inferred fixed-width unicode dtype, which h5py
    # documents as unsupported.
    group.create_dataset('fnames', data=fnames, dtype=h5py.string_dtype())