In [None]:
! pip install git+https://github.com/openai/CLIP.git

In [15]:
# read categories
import os
#os.chdir('/movs_classification_2023/')

import json

# Name the encoding explicitly so reading the JSON mapping does not depend on
# the platform's locale default.
with open('class_to_idx.json', 'r', encoding='utf-8') as f:
    class_to_idx = json.load(f)

# The keys of the mapping are the category names, kept in file order.
cat_names = list(class_to_idx.keys())
print(cat_names)

['Bear', 'Brown bear', 'Bull', 'Camel', 'Canary', 'Cat', 'Caterpillar', 'Cattle', 'Centipede', 'Cheetah', 'Chicken', 'Crab', 'Crocodile', 'Deer', 'Dog', 'Duck', 'Eagle', 'Elephant', 'Fish', 'Fox', 'Frog', 'Giraffe', 'Goat', 'Goldfish', 'Goose', 'Hamster', 'Harbor seal', 'Hedgehog', 'Hippopotamus', 'Horse', 'Jaguar', 'Jellyfish', 'Kangaroo', 'Koala', 'Ladybug', 'Leopard', 'Lion', 'Lizard', 'Lynx', 'Magpie', 'Monkey', 'Moths and butterflies', 'Mouse', 'Mule', 'Ostrich', 'Otter', 'Owl', 'Panda', 'Parrot', 'Penguin', 'Pig', 'Polar bear', 'Rabbit', 'Raccoon', 'Raven', 'Red panda', 'Rhinoceros', 'Scorpion', 'Sea lion', 'Sea turtle', 'Seahorse', 'Shark', 'Sheep', 'Shrimp', 'Snail', 'Snake', 'Sparrow', 'Spider', 'Squid', 'Squirrel', 'Starfish', 'Swan', 'Tick', 'Tiger', 'Tortoise', 'Turkey', 'Turtle', 'Whale', 'Woodpecker', 'Worm', 'Zebra']


In [16]:
import torch
import clip
from PIL import Image

# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the network plus the image transform used before encoding.
model, preprocess = clip.load("ViT-B/32", device=device)

# The raw category names are used directly as the text prompts.
categories = cat_names

# Tokenize every category name, then move the token tensor to the chosen device.
category_texts = clip.tokenize(categories)
category_texts = category_texts.to(device)


In [17]:
# Classify one sample image against all category names
image_path = "sample/4.jpg"
image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)

# Embed the image and the tokenized category names; no gradients are needed
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(category_texts)

# Scale every embedding to unit length so the matrix product below
# compares directions only
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Scaled similarities turned into a probability distribution over categories
similarities = torch.softmax(100.0 * image_features @ text_features.T, dim=-1)

# The most probable category is the prediction
best_category_index = torch.argmax(similarities, dim=-1).item()
best_category = categories[best_category_index]

print(f"The image is classified as: {best_category}")


The image is classified as: Camel


In [13]:
# top 5 categories CLIP

def get_categories_clip(new_image_path, top_num=5,
                        class_to_idx_path='class_to_idx.json'):
    """Print and return the most probable CLIP categories for one image.

    The category names are the keys of the JSON mapping at
    ``class_to_idx_path``. The image and every category name are encoded
    with the CLIP "ViT-B/32" model, the embeddings are normalised to unit
    length, and the similarities (scaled by 100) are passed through a
    softmax to obtain one probability per category.

    Args:
        new_image_path: Path of the image file to classify.
        top_num: How many of the most probable categories to report.
            Clamped to the number of available categories so that
            ``torch.topk`` is never asked for more entries than exist.
        class_to_idx_path: Path of the JSON file whose keys are the
            category names.

    Returns:
        str: The predictions rendered with ``DataFrame.to_string(index=False)``.
        The column labelled 'Category ID' holds the category *name*. The same
        text is also printed.

    Raises:
        ValueError: If ``top_num`` is smaller than 1 or the mapping file
            contains no categories.
    """
    # Imports are local so the function can run without relying on other cells.
    import json

    import pandas as pd
    import torch
    import clip
    from PIL import Image

    if top_num < 1:
        raise ValueError(f"top_num must be a positive integer, got {top_num}")

    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load CLIP model
    model, preprocess = clip.load("ViT-B/32", device)

    # Load category names (explicit encoding: do not depend on the locale default)
    with open(class_to_idx_path, 'r', encoding='utf-8') as f:
        class_to_idx = json.load(f)

    categories = list(class_to_idx.keys())
    if not categories:
        raise ValueError(f"No categories found in {class_to_idx_path}")

    # Tokenize the category names into the format expected by CLIP
    category_texts = clip.tokenize(categories).to(device)

    # Load the image; the context manager closes the file handle once the
    # pixels have been turned into a tensor
    with Image.open(new_image_path) as img:
        image = preprocess(img).unsqueeze(0).to(device)

    with torch.no_grad():
        # Encode the image and the category texts
        image_features = model.encode_image(image)
        text_features = model.encode_text(category_texts)

        # Compute the similarity between the image and each category
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarities = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # Never request more entries than there are categories
    k = min(top_num, len(categories))
    top_prob, top_catid = torch.topk(similarities, k)

    # Drop the batch dimension and move the results to NumPy
    top_prob = top_prob.cpu().numpy()[0]
    top_catid = top_catid.cpu().numpy()[0]

    # Map indices to class names and prepare predictions
    predictions = [
        {'Category ID': categories[class_index], 'Probability': probability}
        for class_index, probability in zip(top_catid, top_prob)
    ]

    df = pd.DataFrame(predictions)
    df_string = df.to_string(index=False)
    print(df_string)

    return df_string



In [12]:
# The function prints the prediction table itself; the trailing semicolon keeps
# the notebook from echoing the returned string a second time.
get_categories_clip('sample/5.jpg');

Using device: cuda
Category ID  Probability
    Chicken     0.958496
     Turkey     0.011513
       Duck     0.005699
     Canary     0.004105
     Parrot     0.003298


'Category ID  Probability\n    Chicken     0.958496\n     Turkey     0.011513\n       Duck     0.005699\n     Canary     0.004105\n     Parrot     0.003298'

In [19]:
# Evaluate zero-shot CLIP on the test split
from utils.clip import test_clip
from utils.vit import prepare_data_vit

# Inputs for this run: image folder and the category-name mapping
image_folder_path = 'Data_small'
class_to_idx_path = 'class_to_idx.json'

print('Selected model: CLIP (zero-shot). Starting testing')

# CLIP uses the same data preparation as ViT, so the ViT helper is reused here
(device, no_of_classes, train_loader,
 test_loader, dataloader) = prepare_data_vit(image_folder_path)

# Only the test loader is passed to the evaluation
metrics_dict = test_clip(class_to_idx_path, test_loader)
metrics_dict
        


Selected model: CLIP (zero-shot). Starting testing
Using 32 workers for data loading
Using cuda
Using device: cuda


Testing: 100%|██████████| 346/346 [00:10<00:00, 32.50it/s]


Accuracy: 0.806483745359051
Precision: 0.8725240529868838
Recall: 0.806483745359051
F1 Score: 0.8252605390772069


{'accuracy': 0.806483745359051,
 'precision': 0.8725240529868838,
 'recall': 0.806483745359051,
 'f1': 0.8252605390772069}