# Getting the CLIP Model from GitHub

In [1]:
!conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Retrieving notices: ...working... done
Channels:
 - pytorch
 - rapidsai
 - nvidia
 - nodefaults
 - conda-forge
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
failed

LibMambaUnsatisfiableError: Encountered problems while solving:
  - package pytorch-1.7.1-py3.6_cpu_0 requires python >=3.6,<3.7.0a0, but none of the providers can be installed
  - package cuda-version-12.3-h32bc705_3 has constraint cudatoolkit 12.3|12.3.* conflicting with cudatoolkit-11.0.221-h6bb024c_0

Could not solve for environment specs
The following packages are incompatible
├─ [32mcuda-version 12.3** [0m is installable and it requires
│  └─ [32mcudatoolkit 12.3|12.3.* [0m, which can be installed;
├─ [31mcudatoolkit 11.0** [0m is not installable because it conflicts with any installable versions previously reported;
├─ [32mpin-1[0m is installable and it requires
│  └─ [32mpython 3.10.* [0m, which can be installed;
└─ [31mpytorch 1.7.1** [0m is not installable because the

In [49]:
import os
import zipfile
import clip
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import pandas as pd  
from tabulate import tabulate 

In [50]:
device = "cuda" if torch.cuda.is_available() else "cpu"

original_data_dir = '/kaggle/input/animal10-shape/original/val'
canny_data_dir = '/kaggle/input/animal10-shape/canny/val'  

classes = sorted(os.listdir(original_data_dir))

# Custom Class to Read Dataset

In [51]:
class CannyImageDataset(Dataset):
    def __init__(self, root_dir, classes, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = classes
        self.image_paths = []
        self.labels = []

        for class_index, class_name in enumerate(classes):
            class_folder = os.path.join(root_dir, class_name)
            for image_name in os.listdir(class_folder):
                if image_name.endswith(('.png', '.jpg', '.jpeg')):
                    self.image_paths.append(os.path.join(class_folder, image_name))
                    self.labels.append(class_index)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label, img_path

In [52]:
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

original_dataset = CannyImageDataset(root_dir=original_data_dir, classes=classes, transform=preprocess)
canny_dataset = CannyImageDataset(root_dir=canny_data_dir, classes=classes, transform=preprocess)

original_dataloader = DataLoader(original_dataset, batch_size=32, shuffle=False, num_workers=2)
canny_dataloader = DataLoader(canny_dataset, batch_size=32, shuffle=False, num_workers=2)

model, _ = clip.load("ViT-B/32", device=device)

text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in classes]).to(device)

# Evaluate CLIP on dataset

In [53]:
def evaluate(model, dataloader, text_inputs):
    total = 0
    correct = 0
    predictions = []
    labels = []

    model.eval()

    with torch.no_grad():
        for images, lbls, img_paths in tqdm(dataloader):
            images = images.to(device)
            lbls = lbls.to(device)

            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features = model.encode_text(text_inputs)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            _, predicted = similarity.max(dim=1)

            total += lbls.size(0)
            correct += (predicted == lbls).sum().item()
            predictions.extend(predicted.cpu().numpy())
            labels.extend(lbls.cpu().numpy())

    accuracy = correct / total
    return accuracy, predictions, labels

# Calculate accuracies and shape bias

In [54]:
original_accuracy, original_predictions, original_labels = evaluate(model, original_dataloader, text_inputs)
print("Accuracy on original dataset: ", original_accuracy)
canny_accuracy, canny_predictions, canny_labels = evaluate(model, canny_dataloader, text_inputs)
print("Accuracy on canny dataset: ", canny_accuracy)
print("Shape bias of the model: ", canny_accuracy/original_accuracy)

100%|██████████| 32/32 [00:05<00:00,  5.79it/s]


Accuracy on original dataset:  0.602


100%|██████████| 32/32 [00:05<00:00,  5.84it/s]

Accuracy on canny dataset:  0.318
Shape bias of the model:  0.5282392026578073





# See class wise data

In [59]:
total_per_class = {class_name: 0 for class_name in classes}
correct_original_incorrect_canny = {class_name: 0 for class_name in classes}
correct_both = {class_name: 0 for class_name in classes}
incorrect_both = {class_name: 0 for class_name in classes}
correct_canny_incorrect_original = {class_name: 0 for class_name in classes}

for i in range(len(original_labels)):
    class_name = classes[original_labels[i]]
    total_per_class[class_name] += 1

    if original_predictions[i] == original_labels[i] and canny_predictions[i] != original_labels[i]:
        correct_original_incorrect_canny[class_name] += 1
    if original_predictions[i] == original_labels[i] and canny_predictions[i] == original_labels[i]:
        correct_both[class_name] += 1
    if original_predictions[i] != original_labels[i] and canny_predictions[i] != original_labels[i]:
        incorrect_both[class_name] += 1
    if original_predictions[i] != original_labels[i] and canny_predictions[i] == original_labels[i]:
        correct_canny_incorrect_original[class_name] += 1

percentages = {}
for class_name in classes:
    percentages[class_name] = {
        "correct_original_incorrect_canny": (correct_original_incorrect_canny[class_name] / total_per_class[class_name]) * 100 if total_per_class[class_name] > 0 else 0,
        "correct_both": (correct_both[class_name] / total_per_class[class_name]) * 100 if total_per_class[class_name] > 0 else 0,
        "incorrect_both": (incorrect_both[class_name] / total_per_class[class_name]) * 100 if total_per_class[class_name] > 0 else 0,
        "correct_canny_incorrect_original": (correct_canny_incorrect_original[class_name] / total_per_class[class_name]) * 100 if total_per_class[class_name] > 0 else 0,
    }

percentage_table = []
for class_name, data in percentages.items():
    percentage_table.append({
        "Class": class_name,
        "Correct Original, Incorrect Canny (%)": round(data["correct_original_incorrect_canny"], 1),
        "Correct Both (%)": round(data["correct_both"], 1),
        "Incorrect Both (%)": round(data["incorrect_both"], 1),
        "Correct Canny, Incorrect Original (%)": round(data["correct_canny_incorrect_original"], 1),
    })

df = pd.DataFrame(percentage_table)

print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False, colalign=("left",) * len(df.columns)))

+------------+---------------------------------------+------------------+--------------------+---------------------------------------+
| Class      | Correct Original, Incorrect Canny (%) | Correct Both (%) | Incorrect Both (%) | Correct Canny, Incorrect Original (%) |
+------------+---------------------------------------+------------------+--------------------+---------------------------------------+
| cane       | 0.0                                   | 0.0              | 99.0               | 1.0                                   |
| cavallo    | 58.0                                  | 26.0             | 12.0               | 4.0                                   |
| elefante   | 23.0                                  | 77.0             | 0.0                | 0.0                                   |
| farfalla   | 58.0                                  | 34.0             | 6.0                | 2.0                                   |
| gallina    | 68.0                                  | 

# Colour and Texture Bias

In [72]:
import torch
import clip
from PIL import Image
import os
import numpy as np

In [75]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

original_dir = '/kaggle/input/animal10-subset/original_subset/small_animal_dataset_updated'
colour_dir = '/kaggle/input/animal10-subset/decolourized_subset/'
texture_dir = '/kaggle/input/animal10-subset/textured_subset/Stylized_images'
# in the textured dir the class names are like cane_stylized, cavallo_stylized etc. whereas in the other two they are like cane, cavallo etc.

In [76]:
def load_images_and_classes(directory, suffix=''):
    images = []
    classes = []
    for class_name in os.listdir(directory):
        class_path = os.path.join(directory, class_name)
        if os.path.isdir(class_path):
            for img_file in os.listdir(class_path):
                if img_file.endswith(('.png', '.jpg', '.jpeg')):
                    img_path = os.path.join(class_path, img_file)
                    images.append(img_path)
                    classes.append(class_name + suffix)  # Append suffix if needed
    return images, classes

original_images, original_classes = load_images_and_classes(original_dir)

colour_images, colour_classes = load_images_and_classes(colour_dir)

texture_images, texture_classes = load_images_and_classes(texture_dir, suffix='_stylized')

In [81]:
def calculate_accuracy(images, classes):
    correct = 0
    total = len(images)
    
    class_names = list(set(classes))
    
    text_inputs = clip.tokenize([f"a photo of {class_name}" for class_name in class_names]).to(device)

    with torch.no_grad():
        text_embeddings = model.encode_text(text_inputs)

    for img_path, class_name in zip(images, classes):
        
        image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)

        with torch.no_grad():
            image_embedding = model.encode_image(image)

        similarities = (image_embedding @ text_embeddings.T).squeeze(0)
        
        predicted_class_index = similarities.argmax().item()
        predicted_class = class_names[predicted_class_index]

        if predicted_class == class_name:
            correct += 1

    accuracy = correct / total * 100
    return accuracy

In [82]:
original_accuracy = calculate_accuracy(original_images, original_classes)
print(f'Original Accuracy: {original_accuracy:.2f}%')

colour_accuracy = calculate_accuracy(colour_images, colour_classes)
print(f'Colour Accuracy: {colour_accuracy:.2f}%')

texture_accuracy = calculate_accuracy(texture_images, texture_classes)
print(f'Texture Accuracy: {texture_accuracy:.2f}%')

Original Accuracy: 65.01%
Colour Accuracy: 61.60%
Texture Accuracy: 37.14%


In [83]:
colour_bias = colour_accuracy / original_accuracy
texture_bias = texture_accuracy / original_accuracy
print(f'Colour Bias: {colour_bias}')
print(f'Texture Bias: {texture_bias}')

Colour Bias: 0.9475444617784712
Texture Bias: 0.5713394250055717
