In [5]:
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from transformers import CLIPModel, CLIPProcessor
import os
from torchvision.datasets import ImageFolder

In [2]:
transform = transforms.Compose([
    transforms.ToTensor(),
])

train_dataset = datasets.SVHN(root='./data', download=True, split='train', transform=transform)
test_dataset = datasets.SVHN(root='./data', download=True, split='test', transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Downloading http://ufldl.stanford.edu/housenumbers/train_32x32.mat to ./data/train_32x32.mat


100%|██████████| 182040794/182040794 [00:20<00:00, 8878409.02it/s] 


Downloading http://ufldl.stanford.edu/housenumbers/test_32x32.mat to ./data/test_32x32.mat


100%|██████████| 64275384/64275384 [00:11<00:00, 5446918.11it/s]


In [3]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
model.eval()

config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [4]:
def evaluate_clip_model(model, processor, data_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  

    correct = 0
    total = 0

    text_labels = [f"This is a digit {i}" for i in range(10)]
    
    text_inputs = processor(text=text_labels, return_tensors="pt", padding=True)
    text_inputs = {k: v.to(device) for k, v in text_inputs.items()}  

    model.eval()  
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            inputs = processor(images=images, return_tensors="pt", padding=True, do_rescale=False)
            inputs = {k: v.to(device) for k, v in inputs.items()}  

            outputs = model(pixel_values=inputs['pixel_values'],
                            input_ids=text_inputs['input_ids'],
                            attention_mask=text_inputs['attention_mask'])

            logits_per_image = outputs.logits_per_image
            predicted = torch.argmax(logits_per_image, dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy


In [11]:
svhn_accuracy = evaluate_clip_model(model, processor, test_loader)
print(f'Model Accuracy on SVHN using CLIP: {svhn_accuracy:.2f}%')


Model Accuracy on SVHN using CLIP: 44.06%


In [12]:
def get_pacs_loaders(batch_size=64):
    print("Entered get_pacs_loaders function")
    
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    root_dir = '/kaggle/input/pacs-dataset/pacs_data/pacs_data'  

    try:
        pacs_photos = ImageFolder(root=os.path.join(root_dir, 'photo'), transform=transform)
        pacs_art = ImageFolder(root=os.path.join(root_dir, 'art_painting'), transform=transform)
        pacs_cartoon = ImageFolder(root=os.path.join(root_dir, 'cartoon'), transform=transform)
        pacs_sketch = ImageFolder(root=os.path.join(root_dir, 'sketch'), transform=transform)
    except FileNotFoundError as e:
        print(f"Error loading dataset: {e}")
        return

    loaders = {
        'Photos': DataLoader(pacs_photos, batch_size=batch_size, shuffle=True),
        'Art_paintings': DataLoader(pacs_art, batch_size=batch_size, shuffle=True),
        'Cartoons': DataLoader(pacs_cartoon, batch_size=batch_size, shuffle=True),
        'Sketches': DataLoader(pacs_sketch, batch_size=batch_size, shuffle=True),
    }

    print(f"Number of photos: {len(pacs_photos)}")
    print(f"Number of art paintings: {len(pacs_art)}")
    print(f"Number of cartoons: {len(pacs_cartoon)}")
    print(f"Number of sketches: {len(pacs_sketch)}")

    return loaders

def run_clip_on_pacs(loaders):
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    model.eval()

    category_accuracies = {}

    for category, loader in loaders.items():
        print(f"Processing {category}...")
        
        correct = 0
        total = 0

        all_preds = []
        all_labels = []

        for images, labels in loader:
            images = images.to(device)  

            text_inputs = [category] * images.size(0)
            inputs = processor(text=text_inputs, images=images, return_tensors="pt", padding=True)

            for key in inputs.keys():
                inputs[key] = inputs[key].to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            logits_per_image = outputs.logits_per_image  
            probs = logits_per_image.softmax(dim=1)  

            preds = probs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())  
            all_labels.extend(labels.cpu().numpy())  

            total += labels.size(0)
            correct += (preds.cpu() == labels.cpu()).sum().item()

        accuracy = 100 * correct / total
        category_accuracies[category] = accuracy
        print(f'{category} Accuracy: {accuracy:.2f}%')

    # Print out the accuracy for each category
    print("\nOverall category-wise accuracies:")
    for category, acc in category_accuracies.items():
        print(f"{category}: {acc:.2f}%")

if __name__ == "__main__":
    print("Loading PACS dataset...")
    loaders = get_pacs_loaders()
    if loaders:
        run_clip_on_pacs(loaders)

Loading PACS dataset...
Entered get_pacs_loaders function
Number of photos: 1670
Number of art paintings: 2048
Number of cartoons: 2344
Number of sketches: 3929
Processing Photos...
Photos Accuracy: 11.32%
Processing Art_paintings...
Art_paintings Accuracy: 18.51%
Processing Cartoons...
Cartoons Accuracy: 16.60%
Processing Sketches...
Sketches Accuracy: 19.65%

Overall category-wise accuracies:
Photos: 11.32%
Art_paintings: 18.51%
Cartoons: 16.60%
Sketches: 19.65%
