In [1]:
import pandas as pd
import torch, clip
import numpy as np

from pathlib import Path
from tqdm import tqdm
from PIL import Image

from torchvision import transforms

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the CLIP checkpoint, e.g. "ViT-B/32" or "ViT-L/14".
model_name = "ViT-B/32"
model, preprocess = clip.load(model_name, device=device)

# Short tag used in output file names, derived from the checkpoint name so the
# two can never drift apart: '32' for ViT-B/32, '14' for ViT-L/14.
setting_name = model_name.split("/")[-1]

In [2]:
# For copyright reasons, the images are not provided
# in this repository.
# Folder that holds the images to encode.
data_path = Path('path/to/images/folder')

# Path.glob yields entries in filesystem-dependent order; sort so the batch
# composition (and the order of `names`) is the same on every run and machine.
image_to_encode = sorted(data_path.glob('*.jpg'))
print(f'Number of images: {len(image_to_encode)}')

Number of images: 11750


In [9]:
# Encode every image in `image_to_encode` with the CLIP image encoder, in
# mini-batches, collecting one embedding row per image and the matching name.
batch_size = 128

if not image_to_encode:
    # Fail early with a clear message instead of an opaque torch.cat error below.
    raise ValueError(f'No images to encode; check that {data_path} contains .jpg files')

embedding_batches = []
names = []
for start in tqdm(range(0, len(image_to_encode), batch_size)):
    fs = image_to_encode[start:start + batch_size]

    # Open each file in a context manager so its handle is closed as soon as
    # the image has been preprocessed into a tensor.
    preprocessed = []
    for f in fs:
        with Image.open(f) as image:
            preprocessed.append(preprocess(image))
    image_input = torch.stack(preprocessed).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_input)

    # Move each batch off the accelerator right away: this keeps GPU memory
    # flat and is required because Tensor.numpy() only works on CPU tensors.
    embedding_batches.append(image_features.cpu())

    # modify the following line if a different naming convention is used
    names.extend(f.stem for f in fs)

# One row per image, in the same order as `names`.
embeddings = torch.cat(embedding_batches).numpy()

100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


In [30]:
import pickle

# Persist the embeddings as a {name: embedding vector} dict next to the image folder.
# NOTE(review): the file is called `txt_emb_*` although it stores image
# embeddings -- confirm the name downstream loaders expect before renaming.
output_path = data_path.parent / f'embeddings/txt_emb_{setting_name}.pkl'

# open(..., 'wb') does not create missing directories, so make sure it exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

# zip() would silently drop entries on a length mismatch; fail loudly instead.
assert len(names) == len(embeddings), 'names and embeddings are out of sync'

with open(output_path, 'wb') as f:
    pickle.dump({name: emb.squeeze() for name, emb in zip(names, embeddings)}, f)