In [1]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
model, preprocess = clip.load("RN50", device=device)
# print("")
# model, preprocess = clip.load("RN101", device=device)
# print("")
# model, preprocess = clip.load("RN50x4", device=device)
# print("")
# model, preprocess = clip.load("RN50x16", device=device)
# print("")
# model, preprocess = clip.load("RN50x64", device=device)

In [2]:
model

CLIP(
  (visual): ModifiedResNet(
    (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu1): ReLU(inplace=True)
    (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu2): ReLU(inplace=True)
    (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu3): ReLU(inplace=True)
    (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
     

In [37]:
image1 = preprocess(Image.open("CLIP.png"))
image2 = preprocess(Image.open("city.png"))
image = torch.stack([image1, image2])
classes = ["diagram", "dog", "cat", "animal", "book", "city"]
text = torch.cat([clip.tokenize(f"a photo of a {c}") for c in classes]).to(device)

In [38]:
image.shape, text.shape

(torch.Size([2, 3, 224, 224]), torch.Size([6, 77]))

In [39]:
image_features = model.encode_image(image)
text_features = model.encode_text(text)
image_features.shape, text_features.shape

(torch.Size([2, 1024]), torch.Size([6, 1024]))

In [40]:
logits_per_image, logits_per_text = model(image, text)
logits_per_image.shape, logits_per_text.shape

Parameter containing:
tensor(4.6052, requires_grad=True)
tensor(100.0000, grad_fn=<ExpBackward0>)


(torch.Size([2, 6]), torch.Size([6, 2]))

In [42]:
probs_per_image = logits_per_image.softmax(dim=-1)
probs_per_text = logits_per_text.softmax(dim=-1)
probs_per_image, probs_per_text

(tensor([[9.9276e-01, 2.2409e-03, 1.0966e-03, 2.9248e-03, 9.3108e-04, 5.0276e-05],
         [1.9476e-03, 1.0972e-03, 7.7613e-04, 1.9052e-03, 2.3030e-03, 9.9197e-01]],
        grad_fn=<SoftmaxBackward0>),
 tensor([[9.9966e-01, 3.3579e-04],
         [9.2265e-01, 7.7348e-02],
         [8.9191e-01, 1.0809e-01],
         [8.9966e-01, 1.0034e-01],
         [7.0249e-01, 2.9751e-01],
         [2.9592e-04, 9.9970e-01]], grad_fn=<SoftmaxBackward0>))

In [45]:
target, target.T

(tensor([[1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.]]),
 tensor([[1., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 1.]]))

In [43]:
probs_per_image = logits_per_image.softmax(dim=-1)
probs_per_text = logits_per_text.softmax(dim=-1)
target = torch.tensor([[1,0,0,0,0,0],[0,0,0,0,0,1]]).type(torch.float32)
loss = torch.nn.CrossEntropyLoss()
loss(probs_per_image, target), loss(probs_per_text, target.T)

(tensor(1.0495, grad_fn=<DivBackward1>),
 tensor(0.1045, grad_fn=<DivBackward1>))

In [33]:
# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")


Top predictions:

         diagram: 98.20%
           paper: 1.09%
          animal: 0.29%
             dog: 0.22%
             cat: 0.11%


In [34]:
image_features.shape, text_features.shape

(torch.Size([2, 1024]), torch.Size([6, 1024]))

In [35]:
logits_per_image = (100.0 * image_features @ text_features.T).softmax(dim=-1)
target = torch.tensor([[1,0,0,0,0,0]]).type(torch.float32)
torch.nn.CrossEntropyLoss()(similarity, target)

ValueError: Expected input batch_size (2) to match target batch_size (1).

In [26]:
similarity

tensor([[9.8203e-01, 2.2167e-03, 1.0847e-03, 2.8931e-03, 9.2102e-04, 1.0857e-02]])

In [29]:
similarity.shape, target.shape

(torch.Size([1, 6]), torch.Size([1, 6]))