## Data Processing and Vector Generation using FiftyOne and Pinecone

In [1]:
import fiftyone.zoo as foz
import pinecone
import numpy as np
from pkg_resources import packaging
import torch

from config import PINECONE_KEY

  from tqdm.autonotebook import tqdm


## Loading the data and the model

In [3]:
# Fetch the COCO-2017 training split from the FiftyOne dataset zoo
# (the data is downloaded only if it is not already on disk).
dataset = foz.load_zoo_dataset(
    "coco-2017",
    split="train",
)

# CLIP ViT-B/32 from the FiftyOne model zoo.
model = foz.load_zoo_model(
    "clip-vit-base32-torch",
)

Downloading split 'train' to '/Users/laurendurivault/fiftyone/coco-2017/train' if necessary
Found annotations at '/Users/laurendurivault/fiftyone/coco-2017/raw/instances_train2017.json'
Downloading images to '/Users/laurendurivault/fiftyone/coco-2017/tmp-download/train2017.zip'
 100% |████|  144.1Gb/144.1Gb [11.1m elapsed, 0s remaining, 228.3Mb/s]      
Extracting images to '/Users/laurendurivault/fiftyone/coco-2017/train/data'
Writing annotations to '/Users/laurendurivault/fiftyone/coco-2017/train/labels.json'
Dataset info written to '/Users/laurendurivault/fiftyone/coco-2017/info.json'
Loading 'coco-2017' split 'train'
 100% |███████████| 118287/118287 [6.9m elapsed, 0s remaining, 340.3 samples/s]      
Dataset 'coco-2017-train' created


## Generate embeddings

In [5]:
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Token tensors use torch.long on torch releases older than 1.8.0 and
# torch.int on 1.8.0 and newer.
dtype = (
    torch.long
    if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0")
    else torch.int
)

In [6]:
# Run the model over every sample and store the result on each sample under
# the "embedding" field. This is slow: the recorded run took about 1.5 hours.
dataset.compute_embeddings(model, embeddings_field="embedding")

 100% |███████████| 118287/118287 [1.5h elapsed, 0s remaining, 20.5 samples/s]      


In [7]:
# Flag the dataset as persistent so the computed embeddings can be reused
# later (presumably keeps it in FiftyOne's database across sessions — confirm
# against the FiftyOne docs).
dataset.persistent = True

## Initializing the Pinecone index and upserting the vectors

In [8]:
# Connect to the Pinecone project; the API key comes from the local config module.
pinecone.init(api_key=PINECONE_KEY, environment="us-east4-gcp")

# Start from a clean slate, but only drop the index this notebook owns.
# Deleting whatever happens to be first in the account's index list could
# destroy an unrelated index.
stale_index_name = "clip-image-search"  # must match the name passed to create_index
indices = pinecone.list_indexes()
if stale_index_name in indices:
    pinecone.delete_index(stale_index_name)

In [40]:
index_name = "clip-image-search"

# create_index fails when the name is already taken, so only create the index
# when it is missing; this keeps the cell safe to re-run.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=512,  # must equal the length of the embeddings that get upserted
        metric="cosine",
        pod_type="p1",
    )

# Handle used for the upsert and query calls.
index = pinecone.Index(index_name)

In [42]:
# Pinecone expects plain Python lists, so unwrap each numpy embedding.
embeddings = [
    vector.tolist()
    for vector in dataset.values("embedding")
]

# Use the public COCO URL of each image as its vector id: keep only the file
# name from the local path and prepend the train2017 base URL.
ids = [
    "http://images.cocodataset.org/train2017/" + path.rsplit("/", 1)[-1]
    for path in dataset.values("filepath")
]

In [43]:
# Pair every id with its embedding: one (id, embedding) tuple per sample.
index_vectors = [(vector_id, vector) for vector_id, vector in zip(ids, embeddings)]

def upsert_vectors(index, vectors, batch_size=100):
    """Upsert ``vectors`` into ``index`` in fixed-size batches.

    Args:
        index: object exposing ``upsert(batch)``; in this notebook a
            ``pinecone.Index``.
        vectors: sequence of ``(id, embedding)`` tuples; must support ``len``
            and slicing.
        batch_size: maximum number of vectors sent per ``upsert`` call.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Slice the ``vectors`` argument (not a notebook-level global) so the
    # function works for whatever the caller passes in. Slicing past the end
    # simply yields the shorter final batch; an empty input sends nothing.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start : start + batch_size])

# Push every (id, embedding) pair into the Pinecone index.
upsert_vectors(index=index, vectors=index_vectors)

## Testing the image search

In [29]:
def get_text_embedding(prompt, clip_model):
    """Embed a text prompt with the CLIP model for use as a Pinecone query.

    Relies on the notebook-level ``dtype`` and ``device`` variables set in the
    "Generate embeddings" section.

    Args:
        prompt: the text to embed.
        clip_model: the CLIP model loaded from the FiftyOne zoo; its
            ``_tokenizer``, ``_model`` and ``config.context_length``
            attributes are used.

    Returns:
        The output of ``encode_text`` for the single prompt, converted to
        nested Python lists (presumably one row holding the embedding).
    """
    tokenizer = clip_model._tokenizer
    context_length = clip_model.config.context_length

    # standard start-of-text / end-of-text tokens
    sot_token = tokenizer.encoder["<|startoftext|>"]
    eot_token = tokenizer.encoder["<|endoftext|>"]

    tokens = [sot_token] + tokenizer.encode(prompt) + [eot_token]
    if len(tokens) > context_length:
        # Over-long prompts would not fit the fixed-size input below;
        # truncate and keep the end-of-text token in the final slot.
        tokens = tokens[:context_length]
        tokens[-1] = eot_token

    # Zero-padded token matrix with a single row for the single prompt.
    text_features = torch.zeros(
        1,
        context_length,
        dtype=dtype,
        device=device,
    )
    text_features[0, : len(tokens)] = torch.tensor(
        tokens, dtype=dtype, device=device
    )

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        embedding = clip_model._model.encode_text(text_features)

    # convert to list for Pinecone
    return embedding.tolist()

In [47]:
prompt = "a smile"

# Embed the text prompt, then fetch the ten closest image vectors.
# Only ids and scores are requested; the stored vector values are left out.
query_vector = get_text_embedding(prompt=prompt, clip_model=model)
top_k_samples = index.query(vector=query_vector, top_k=10, include_values=False)

# Bare final expression so the notebook renders the response.
top_k_samples

{'matches': [{'id': 'http://images.cocodataset.org/train2017/000000506187.jpg',
              'score': 0.268776685,
              'values': []},
             {'id': 'http://images.cocodataset.org/train2017/000000546963.jpg',
              'score': 0.267506778,
              'values': []},
             {'id': 'http://images.cocodataset.org/train2017/000000110798.jpg',
              'score': 0.267011523,
              'values': []},
             {'id': 'http://images.cocodataset.org/train2017/000000048665.jpg',
              'score': 0.258640438,
              'values': []},
             {'id': 'http://images.cocodataset.org/train2017/000000423327.jpg',
              'score': 0.257749081,
              'values': []},
             {'id': 'http://images.cocodataset.org/train2017/000000445351.jpg',
              'score': 0.249218613,
              'values': []},
             {'id': 'http://images.cocodataset.org/train2017/000000362138.jpg',
              'score': 0.248322085,
              