In [1]:
# from transformers import BertModel, BertConfig, BertTokenizer, ViTConfig, ViTImageProcessor, ViTModel, VisionEncoderDecoderConfig, VisionEncoderDecoderModel
# from transformers import SiglipImageProcessor, SiglipVisionModel, SiglipTokenizer, SiglipTextModel, SiglipProcessor, SiglipModel
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, CLIPTokenizer, CLIPTextModelWithProjection, CLIPProcessor, CLIPModel
import torch

# Single source of truth for the checkpoint: the image processor, tokenizer and
# both towers must all come from the same CLIP checkpoint for the image and
# text embeddings to be comparable.
MODEL_NAME = "openai/clip-vit-base-patch32"

# Vision side: preprocessing + image tower with projection head.
vision_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
vision_model = CLIPVisionModelWithProjection.from_pretrained(MODEL_NAME)

# Text side: tokenizer + text tower with projection head.
tokenizer = CLIPTokenizer.from_pretrained(MODEL_NAME)
text_model = CLIPTextModelWithProjection.from_pretrained(MODEL_NAME)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move both models to the selected device; the last expression is left bare so
# the notebook displays the text model's architecture.
vision_model.to(device)
text_model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


CLIPTextModelWithProjection(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm(

In [2]:
# Get image embedding
from PIL import Image

def get_image_embedding(path):
    """Return the CLIP image embedding for the image stored at `path`.

    The image is preprocessed with `vision_processor`, moved to the device
    that `vision_model` lives on, and encoded without gradient tracking.

    Args:
        path: Anything `PIL.Image.open` accepts (typically a file path).

    Returns:
        The first row of the model's `image_embeds`, as a NumPy array.
    """
    with Image.open(path) as img:
        # Preprocess inside the `with` so the image file is still open.
        encoded_image_input = vision_processor(images=[img], return_tensors="pt")

    # The model may have been moved to the GPU; its inputs must be on the same
    # device or the forward pass raises a device-mismatch error.
    encoded_image_input = encoded_image_input.to(vision_model.device)

    with torch.no_grad():
        image_outputs = vision_model(**encoded_image_input)

    # `.numpy()` only works on CPU tensors, so copy back before converting.
    return image_outputs.image_embeds.detach().cpu().numpy()[0]

# Embed the image at ../outputs/frames/frame1.png; the bare name on the last
# line makes the notebook display the resulting vector.
image_embeds = get_image_embedding('../outputs/frames/frame1.png')
image_embeds

array([ 2.45595366e-01,  4.31194007e-02,  6.71354532e-02,  1.50591582e-01,
        5.48219025e-01, -2.07283974e-01,  3.35441202e-01, -8.86689305e-01,
       -4.01517957e-01, -3.29281896e-01,  3.73944566e-02,  5.32619059e-02,
        6.73917651e-01,  4.85287845e-01, -2.94649243e-01, -3.00068736e-01,
       -6.39913917e-01,  7.23498523e-01,  1.34650394e-01,  1.31560862e-01,
       -4.27026302e-03,  6.32782578e-01,  3.69540334e-01,  2.92050362e-01,
       -3.02810073e-01, -7.61406571e-02, -5.29183373e-02, -4.21642125e-01,
        3.94836724e-01, -2.10662737e-01, -1.04815409e-01,  3.59052002e-01,
       -5.13117194e-01, -3.33380461e-01, -5.03384322e-02,  3.70914489e-01,
       -4.63745445e-02, -2.42357418e-01,  7.19626099e-02,  1.49308491e+00,
       -2.63576284e-02, -4.27662641e-01,  2.29671851e-01, -1.81545854e-01,
       -2.50454307e-01,  2.19136691e+00,  6.89030051e-01, -3.36743027e-01,
        3.30458909e-01, -3.47266734e-01,  2.14879930e-01,  9.79231894e-02,
        1.65221214e-01,  

In [5]:
# Get text embedding
def get_text_embedding(text):
    """Return the CLIP text embedding for a single string.

    The text is tokenized with `tokenizer`, moved to the device that
    `text_model` lives on, and encoded without gradient tracking.

    Args:
        text: The string to embed.

    Returns:
        The first row of the model's `text_embeds`, as a NumPy array.
    """
    encoded_text_input = tokenizer([text], return_tensors="pt", padding=True)

    # The model may have been moved to the GPU; its inputs must be on the same
    # device or the forward pass raises a device-mismatch error.
    encoded_text_input = encoded_text_input.to(text_model.device)

    with torch.no_grad():
        text_output = text_model(**encoded_text_input)

    # `.numpy()` only works on CPU tensors, so copy back before converting.
    return text_output.text_embeds.detach().cpu().numpy()[0]

# Embed a short caption; the bare name on the last line makes the notebook
# display the resulting vector.
text_embeds = get_text_embedding("It's Barcelona Paris - Saint Germain")
text_embeds

array([ 3.25445116e-01, -1.73046052e-01,  3.75780195e-01, -7.30709732e-02,
        2.01427445e-01, -2.83918947e-01,  7.75187090e-02, -5.76296985e-01,
       -2.29282007e-01,  6.78484440e-02, -1.83479920e-01,  1.83670491e-01,
        3.36990878e-02,  3.03947061e-01, -4.58058953e-01, -1.52292237e-01,
        6.41324043e-01, -1.84748143e-01, -1.48912996e-01,  3.31942439e-01,
        4.41840649e-01,  1.46992594e-01, -2.29114667e-02,  1.44857645e-01,
       -2.13152230e-01, -2.52798378e-01,  2.13436186e-01, -1.78617537e-01,
       -1.65691912e-01,  1.05545670e-02, -1.58319339e-01, -1.83525383e-02,
        7.63793141e-02,  4.12450194e-01, -3.08925688e-01,  1.89659402e-01,
       -8.75749439e-02, -2.32982770e-01,  2.20016733e-01,  2.12590158e-01,
        2.80660152e-01, -3.35606933e-01,  1.80693224e-01,  2.71735132e-01,
       -2.62041628e-01,  2.82379687e-01, -1.17166050e-01, -2.03063607e-01,
        6.57913685e-02, -5.67428648e-01, -3.56719196e-01, -5.62860847e-01,
       -2.10765511e-01,  

In [8]:
import numpy as np

def cos_sim(emb_1, emb_2):
    """Cosine similarity of two embeddings.

    Computes ``dot(emb_1, emb_2)`` divided by the product of the two
    norms as returned by ``np.linalg.norm``.

    Args:
        emb_1: First embedding (array-like).
        emb_2: Second embedding (array-like).

    Returns:
        The dot product scaled by both norms; its shape follows ``np.dot``.
    """
    numerator = np.dot(emb_1, emb_2)
    denominator = np.linalg.norm(emb_1) * np.linalg.norm(emb_2)
    return numerator / denominator

# Similarity between the image embedding and the short caption embedding.
cos_sim(image_embeds, text_embeds)

0.26102865

In [30]:
# Embed a longer, more detailed description to use as the query.
query_embeddings = get_text_embedding(
    "Barcelona defeats Paris Saint-Germain 6-1 in an historic match in the Champions League, with Neymar scoring twice and Messi contributing a goal as well"
)

# Compare the query against each embedding on its own: image first, then text.
print(cos_sim(query_embeddings, image_embeds))
print(cos_sim(query_embeddings, text_embeds))

# Join the image and text embeddings end-to-end into one longer vector.
concatenated_embeddings = np.concatenate([image_embeds, text_embeds])
# np.resize fills the larger target shape with repeated copies of the query, so
# the query is compared against both halves of the concatenated vector. The
# leading axis of size 1 makes the result a one-element array, not a scalar.
# The score lands about midway between the two individual similarities above.
# NOTE(review): neither embedding is normalised before concatenation, so the
# half with the larger norm weighs more in this score; confirm that is intended.
cos_sim(np.resize(query_embeddings, (1,len(concatenated_embeddings))), concatenated_embeddings)

0.23230827
0.7149211


array([0.47741628], dtype=float32)