In [16]:
# Tweaked from https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel.forward.returns
#
# Standard CLIP usage: score text snippets against photos.

from PIL import Image

import requests

import torch

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"


def load_image(image_url, timeout=30):
    """Download `image_url` and open it as a PIL image.

    A timeout is passed so a stalled server cannot hang the cell, and
    `raise_for_status()` raises `requests.HTTPError` on a 4xx/5xx response
    instead of handing an error page to PIL.
    """
    response = requests.get(image_url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw)


image1 = load_image(url)
image2 = load_image(url)  # assume a different image

images = [image1, image2]

# The rest of this cell is standard use of CLIP.
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True)

# Inference only: skip autograd bookkeeping so the outputs do not carry a graph.
with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image  # this is the image-text similarity score

probs = logits_per_image.softmax(dim=1)  # softmax across the text prompts gives per-image label probabilities

We need to adapt the code above so that we can calculate scaled dot-product scores between images and other images, rather than only between images and text. The cell below shows how to get the image embeddings.

I believe I've done that below, working from the source code [here](https://github.com/huggingface/transformers/blob/v4.29.1/src/transformers/models/clip/modeling_clip.py#L1074).

In [2]:
outputs.image_embeds.shape

torch.Size([2, 512])

In [3]:
# Ingredients for scoring images against other images.
import torch

# Scale factor applied to the dot products; the model's `logit_scale`
# parameter is exponentiated before use, following the linked CLIP source.
logit_scale = torch.exp(model.logit_scale)

# Image embeddings produced by the forward pass in the CLIP cell above.
image_embeds = outputs.image_embeds

In [15]:
# Assume these are different sets of images.
moodboard_images = images
recommendation_candidate_images = images

# CLIPModel's forward pass is called with text as well as images here, so a
# placeholder prompt is supplied purely to make the call run; only the image
# embeddings are read from the result.
inputs1 = processor(text=["dummy"], images=moodboard_images, return_tensors="pt", padding=True)
inputs2 = processor(text=["dummy"], images=recommendation_candidate_images, return_tensors="pt", padding=True)

# Inference only: without no_grad() the scores would require grad (through the
# model weights and `logit_scale`), which wastes memory and makes `.numpy()`
# on the result raise.
with torch.no_grad():
    moodboard_image_embeds = model(**inputs1).image_embeds
    recommendation_candidate_image_embeds = model(**inputs2).image_embeds

    # Hopefully, the closer these are, the higher the recommendation quality.
    # Result has one row per candidate image and one column per moodboard image.
    # NOTE(review): this is a scaled cosine similarity only if `image_embeds`
    # are already L2-normalised, as the linked modeling_clip.py appears to do —
    # confirm for the installed transformers version.
    recommender_scores = torch.matmul(
        recommendation_candidate_image_embeds,
        moodboard_image_embeds.t()
    ) * logit_scale

# 5.14.23 - autoencoder

In [17]:
import torch
from torch import nn

In [65]:
class Encoder(nn.Module):
    """Convolutional encoder producing two latent-sized vectors per input.

    One ``Conv2d(stride=2) -> BatchNorm2d -> LeakyReLU`` stage is built for
    each entry of ``hidden_dims``. The resulting feature map is summed over
    its two spatial axes and fed to two independent linear heads, ``fc_mu``
    and ``fc_var`` (presumably the mean and log-variance of a VAE posterior).

    Args:
        in_channels: channel count of the input images.
        latent_dim: output width of each linear head.
        hidden_dims: output channel count of each convolutional stage, in order.
    """

    def __init__(self, in_channels, latent_dim, hidden_dims):
        super().__init__()
        self.latent_dim = latent_dim

        # Stage i consumes the channels produced by stage i - 1; the first
        # stage consumes the raw input channels.
        stage_inputs = [in_channels, *hidden_dims[:-1]]
        stages = [
            nn.Sequential(
                nn.Conv2d(c_in, out_channels=c_out, kernel_size=3, stride=2, padding=1),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(),
            )
            for c_in, c_out in zip(stage_inputs, hidden_dims)
        ]
        self.encoder = nn.Sequential(*stages)

        # Both heads read the spatially pooled output of the last stage.
        final_width = hidden_dims[-1]
        self.fc_mu = nn.Linear(final_width, self.latent_dim)
        self.fc_var = nn.Linear(final_width, self.latent_dim)

    def forward(self, input):
        """Return ``(mu, log_var)``, each of shape ``(batch, latent_dim)``."""
        features = self.encoder(input)

        # Sum over height and width, so any input resolution yields one
        # vector of width hidden_dims[-1] per sample.
        pooled = features.sum(dim=(2, 3))

        return (self.fc_mu(pooled), self.fc_var(pooled))
    
    
class Decoder(nn.Module):
    """Transposed-convolution decoder built from the encoder's settings.

    The latent vector is projected to ``encoder_hidden_dims[-1]`` features,
    reshaped to a 1x1 feature map, and passed through stride-2
    ``ConvTranspose2d -> BatchNorm2d -> LeakyReLU`` stages that walk the
    encoder's channel widths in reverse. Each stage doubles height and width.
    A final stage upsamples once more and a 3x3 convolution maps back to
    ``encoder_in_channels`` channels, so the output is
    ``2 ** len(encoder_hidden_dims)`` pixels on each side.

    Args:
        encoder_in_channels: channel count of the reconstructed images.
        encoder_latent_dim: width of the latent vector fed to ``forward``.
        encoder_hidden_dims: the encoder's per-stage channel counts, in
            encoder order (they are reversed internally).
    """

    def __init__(
        self,
        encoder_in_channels,
        encoder_latent_dim,
        encoder_hidden_dims,
    ):
        super().__init__()

        # Latent vector -> features of the widest (deepest) encoder stage.
        self.decoder_input = nn.Linear(encoder_latent_dim, encoder_hidden_dims[-1])

        # Channel widths in decoding order: widest first.
        widths = list(encoder_hidden_dims)[::-1]
        self.hidden_dims = widths

        def upsample_stage(c_in, c_out):
            # kernel 3 / stride 2 / padding 1 / output_padding 1 doubles H and W.
            return [
                nn.ConvTranspose2d(
                    c_in, c_out, kernel_size=3, stride=2, padding=1, output_padding=1
                ),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(),
            ]

        # One stage per adjacent pair of widths.
        self.decoder = nn.Sequential(
            *(
                nn.Sequential(*upsample_stage(c_in, c_out))
                for c_in, c_out in zip(widths, widths[1:])
            )
        )

        # Last upsampling keeps the channel count, then a 3x3 convolution
        # maps to the image channels.
        narrowest = widths[-1]
        self.final_layer = nn.Sequential(
            *upsample_stage(narrowest, narrowest),
            nn.Conv2d(narrowest, out_channels=encoder_in_channels, kernel_size=3, padding=1),
        )

    def forward(self, encoder_latent):
        """Decode a ``(batch, encoder_latent_dim)`` latent into an image batch."""
        # Treat the projected vector as a 1x1 feature map with hidden_dims[0] channels.
        seed_map = self.decoder_input(encoder_latent).view(-1, self.hidden_dims[0], 1, 1)
        return self.final_layer(self.decoder(seed_map))

In [54]:
# Autoencoder hyper-parameters.
encoder_in_channels = 1
encoder_hidden_dims = [32, 64, 128, 256, 512]
# Latent width is four times the widest convolutional stage.
encoder_latent_dim = 4 * encoder_hidden_dims[-1]

# Encoder and decoder are built from the same three settings so their
# channel widths and latent size line up.
encoder = Encoder(
    in_channels=encoder_in_channels,
    latent_dim=encoder_latent_dim,
    hidden_dims=encoder_hidden_dims,
)

decoder = Decoder(
    encoder_in_channels=encoder_in_channels,
    encoder_latent_dim=encoder_latent_dim,
    encoder_hidden_dims=encoder_hidden_dims,
)

In [71]:
# Random stand-in batch for shape checks: 8 samples, encoder_in_channels channels, 64x64 pixels.
# NOTE: this rebinds `images`, which the CLIP cells earlier in the notebook use for a
# list of PIL images — re-run the first CLIP cell before re-running those cells.
images = torch.randn(8, encoder_in_channels, 64, 64)

In [72]:
# Run only the convolutional stack (the `encoder` attribute), skipping the
# spatial sum and the two linear heads that Encoder.forward applies afterwards.
hid = encoder.encoder(images)

In [73]:
# Inspect the feature-map shape: (batch, channels, height, width).
hid.shape

torch.Size([8, 512, 2, 2])

In [63]:
# NOTE(review): `z` is not defined in any visible cell — presumably a latent batch of
# shape (8, encoder_latent_dim), e.g. one of the encoder's outputs; define it explicitly
# so this cell survives Restart & Run All.
# NOTE(review): the decoded size shown below is 32x32 while the random input batch used
# for the encoder check is 64x64 — confirm whether the size mismatch is intended.
decoder(z).shape

torch.Size([8, 1, 32, 32])

In [75]:
# IPython-only docstring lookup (the trailing `?` is not valid plain Python);
# leftover exploration — remove before sharing the notebook.
nn.TransformerDecoderLayer?