In [18]:
!pip install transformers torch torchvision

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [19]:
import torch
from torchvision import datasets, transforms
from transformers import CLIPProcessor, CLIPModel
import torch.nn.functional as F
from tqdm.notebook import tqdm

In [20]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [21]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [22]:
cifar_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in tqdm(dataloader):
            images, labels = images.to(device), labels.to(device)

            image_features = model.get_image_features(images)
            image_features = F.normalize(image_features, p=2, dim=-1)

            text_inputs = processor(text=cifar_classes, return_tensors="pt", padding=True).to(device)
            text_features = model.get_text_features(**text_inputs)
            text_features = F.normalize(text_features, p=2, dim=-1)

            similarity = torch.matmul(image_features, text_features.T)

            predictions = similarity.argmax(dim=1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    return correct / total

accuracy = evaluate(model, test_loader)
print(f"Test Accuracy: {accuracy:.4f}")

  0%|          | 0/157 [00:00<?, ?it/s]

Test Accuracy: 0.8242
