In [27]:
import pandas as pd
import torch, clip
import numpy as np

from tqdm import tqdm
from PIL import Image
from pathlib import Path

from torchvision import transforms

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
setting_name = '32' # '32' for ViT-B/32, '14' for ViT-L/14

In [10]:
# pick a dataset

data_path = Path('experiments/svo_probes/svo_probes.csv')
# data_path = Path('experiments/vl_checklist/vl_checklist.csv')
# data_path = Path('experiments/valse/valse_dataset.csv')

data = pd.read_csv(data_path, index_col=0)

In [11]:
data.head()

Unnamed: 0,pos_caption,neg_caption,pos_image_id,pos_subject,pos_verb,pos_object,neg_verb,pos_matched_tense,neg_matched_tense
3,A car passes another on a street.,a car hits another on a street.,6,car,pass,street,hit,passes,hits
9,A girl is sitting in the meadow.,a girl is standing in the meadow.,18,girl,sit,meadow,stand,sitting,standing
11,A dog sitting in a meadow.,a dog running in a meadow.,22,dog,sit,meadow,run,sitting,running
16,Women trek through the forrest.,women run through the forrest.,32,woman,trek,forest,run,trek,run
17,A lady sits in the sand on the beach.,a lady walks in the sand on the beach.,34,lady,sit,beach,walk,sits,walks


In [20]:
# check if the data has nan
print(data.isna().sum())

pos_caption            0
neg_caption            0
pos_image_id           0
pos_subject            0
pos_verb               0
pos_object             0
neg_verb              21
pos_matched_tense      0
neg_matched_tense    631
dtype: int64


In [21]:
string_columns = data.columns[data.dtypes == 'object']
# gather all the strings in one list
text_to_be_encoded = list(set(data[string_columns].values.flatten()))
text_to_be_encoded = [x for x in text_to_be_encoded if type(x) == str]
print(f'Number of unique strings: {len(text_to_be_encoded)}')

Number of unique strings: 26089


In [28]:
embeddings = []
names = []

for text in tqdm(text_to_be_encoded):
    # check if the text is nan
    if pd.isna(text):
        continue
    names.append(text)
    text = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text)
    embeddings.append(text_features.cpu().numpy())

100%|██████████| 26089/26089 [21:40<00:00, 20.06it/s]


In [30]:
# save the embeddings and names in a npz file
# the path should be the same as the csv but with embeddings.npz

# np.savez(data_path.with_suffix(f'.embeddings.{setting_name}.npz'), embeddings=embeddings, names=names)
# save them as a dictionary using pickle instead 
import pickle
with open(data_path.parent / f'embeddings/txt_emb_{setting_name}.pkl', 'wb') as f:
    pickle.dump({name: emb.squeeze() for name, emb in zip(names, embeddings)}, f)