In [1]:
!conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Retrieving notices: ...working... done
Channels:
 - pytorch
 - rapidsai
 - nvidia
 - nodefaults
 - conda-forge
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
failed

LibMambaUnsatisfiableError: Encountered problems while solving:
  - package pytorch-1.7.1-py3.6_cpu_0 requires python >=3.6,<3.7.0a0, but none of the providers can be installed
  - package cuda-version-12.3-h32bc705_3 has constraint cudatoolkit 12.3|12.3.* conflicting with cudatoolkit-11.0.221-h6bb024c_0

Could not solve for environment specs
The following packages are incompatible
├─ [32mcuda-version 12.3** [0m is installable and it requires
│  └─ [32mcudatoolkit 12.3|12.3.* [0m, which can be installed;
├─ [31mcudatoolkit 11.0** [0m is not installable because it conflicts with any installable versions previously reported;
├─ [32mpin-1[0m is installable and it requires
│  └─ [32mpython 3.10.* [0m, which can be installed;
└─ [31mpytorch 1.7.1** [0m is not installable because the

In [2]:
import clip
import torch
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("/kaggle/input/cat-image/cat1.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(["a cute cat", "a normal cat", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs) 

Label probs: [[0.7754  0.06464 0.16   ]]


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(), 
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  
])

testset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=preprocess)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=2)

cifar10_classes = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck'
]

Files already downloaded and verified


In [11]:
model, preprocess = clip.load("ViT-B/32", device=device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar10_classes]).to(device)

In [12]:
def evaluate(model, testloader, text_inputs):
    total = 0
    correct = 0

    model.eval()

    with torch.no_grad():
        for images, labels in tqdm(testloader):
            
            images = images.to(device)
            labels = labels.to(device)

            image_features = model.encode_image(images)

            image_features /= image_features.norm(dim=-1, keepdim=True)

            text_features = model.encode_text(text_inputs)

            text_features /= text_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

            _, predicted = similarity.max(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

accuracy = evaluate(model, testloader, text_inputs)
print(f"Accuracy of CLIP on CIFAR-10: {accuracy:.4f}")

100%|██████████| 782/782 [01:39<00:00,  7.83it/s]

Accuracy of CLIP on CIFAR-10: 0.8514



