In [16]:
# tweaked from https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel.forward.returns

# below is standard CLIP usage to score text snippets against a photo

from PIL import Image

import requests

from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP checkpoint and its matching pre-processor.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"


def load_image(image_url, timeout=30):
    """Download ``image_url`` and open it as a PIL image.

    Args:
        image_url: HTTP(S) address of the image file.
        timeout: Seconds to wait on a silent server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``PIL.Image.Image`` opened from the response stream.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Without this check the error page would be handed to PIL and
            surface as an unrelated image-decoding failure.
    """
    response = requests.get(image_url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw)


image1 = load_image(url)
image2 = load_image(url)  # assume a different image

images = [image1, image2]

# the rest of this cell is standard use of CLIP
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True)

outputs = model(**inputs)

logits_per_image = outputs.logits_per_image  # this is the image-text similarity score

probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

We need to adapt the above so that we can compute scaled dot-product scores between images and other images, rather than only between images and text. The cell below shows how to get the image embeddings.

I believe I've done that below, working from the Hugging Face source code [here](https://github.com/huggingface/transformers/blob/v4.29.1/src/transformers/models/clip/modeling_clip.py#L1074).

In [2]:
# Display the shape of the image embeddings returned by the model call above.
outputs.image_embeds.shape

torch.Size([2, 512])

In [3]:
# code for calculating similarity scores between images
import torch

# Scale factor for the similarity scores: the exponential of the model's
# `logit_scale` attribute.
logit_scale = torch.exp(model.logit_scale)

# Image embeddings taken from the earlier `model(**inputs)` call.
image_embeds = outputs.image_embeds

In [15]:
# assume these are different sets of images
moodboard_images = images
recommendation_candidate_images = images


def embed_images(image_batch):
    """Return L2-normalised CLIP image embeddings, one row per image.

    Only the vision tower and its projection layer are run, mirroring the
    image branch of ``CLIPModel.forward``, so no placeholder text prompt is
    needed and the text encoder is never evaluated.

    Args:
        image_batch: A list of PIL images.

    Returns:
        A 2-D tensor of unit-length embeddings, so that a dot product
        between two rows is their cosine similarity.
    """
    pixel_inputs = processor(images=image_batch, return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        pooled = model.vision_model(pixel_values=pixel_inputs["pixel_values"]).pooler_output
        projected = model.visual_projection(pooled)
    # The full forward pass normalises its image_embeds the same way.
    return projected / projected.norm(p=2, dim=-1, keepdim=True)


moodboard_image_embeds = embed_images(moodboard_images)
recommendation_candidate_image_embeds = embed_images(recommendation_candidate_images)

# hopefully, the closer these are, the higher the recommendation quality
# rows index recommendation candidates, columns index moodboard images
recommender_scores = torch.matmul(
    recommendation_candidate_image_embeds,
    moodboard_image_embeds.t()
) * logit_scale