<a href="https://colab.research.google.com/github/javiimo/ImageClassificationAssignment/blob/main/CLIPClass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

import clip
import torch
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import copy



Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.2.0
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-s8sxls1i
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-s8sxls1i
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Using ca

In [None]:
# Mount Google Drive at /content/drive so the dataset folder can be accessed from Colab.
# NOTE: the location of the datasets inside the drive probably differs per account — adjust as needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load our data sets


In [None]:

class CLIPModel:
    """Zero-shot CLIP classifier with test-time adaptation helpers (MEMO / TPT).

    The instance owns the CLIP model, its preprocessing transform, an SGD
    optimizer over all CLIP parameters and the cached, normalised text
    features of the class prompts (set by ``tokenize_labels``).
    """

    def __init__(self, model_name='ViT-B/32', device=None, lr=1000, momentum=0.9, logit_scale=1.0):
        """Load CLIP and build the optimizer.

        Args:
            model_name: name passed to ``clip.load``.
            device: torch device string; defaults to "cuda" when available, else "cpu".
            lr: SGD learning rate (default kept from the original notebook).
            momentum: SGD momentum.
            logit_scale: factor applied to the cosine similarities before the
                softmax in ``class_probabilities``. The default of 1.0 keeps
                the original behaviour.
                NOTE(review): the CLIP README example multiplies by 100; with
                1.0 the class probabilities stay close to uniform. Confirm the
                intended value (and retune ``lr`` if it is changed).
        """
        self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load(model_name, self.device)
        # Adam was tried here before and turned out numerically unstable; SGD is used instead.
        # convert_model_parameters_to_float32() is available if full precision is wanted.
        self.optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=momentum)
        self.logit_scale = logit_scale
        self.text_features = None
        # None means "unknown", so the first require_CLIP_gradients() call always applies.
        self.requiring_grads = None

    @staticmethod
    def _safe_eps(tensor, floor):
        """Return a clamp epsilon that does not underflow to zero in ``tensor``'s dtype.

        A constant such as 1e-20 becomes 0 in float16, which makes the clamp a
        no-op; the smallest normal number of the dtype is used when it is larger.
        """
        if tensor.is_floating_point():
            return max(floor, torch.finfo(tensor.dtype).tiny)
        return floor

    def _resolve_text_features(self, text_features=None):
        """Return ``text_features`` or the cached ones; raise if neither is available."""
        if text_features is None:
            text_features = self.text_features
        if text_features is None:
            raise RuntimeError("Text features are not set; call tokenize_labels() first.")
        return text_features

    def _encode_image_features(self, batch):
        """Encode a preprocessed image batch and L2-normalise each feature row."""
        image_features = self.model.encode_image(batch)
        norms = image_features.norm(dim=-1, keepdim=True)
        if (norms == 0).any():
            print("Zero norm found in image features")
        return image_features / norms.clamp_min(self._safe_eps(norms, 1e-10))

    def require_CLIP_gradients(self, state=True):
        """Set ``requires_grad`` on every CLIP parameter to ``state``.

        Does nothing when the parameters are already known to be in that state.
        """
        if state == self.requiring_grads:
            return
        for param in self.model.parameters():
            param.requires_grad = state
        self.requiring_grads = state

    def convert_model_parameters_to_float32(self, model):
        """Cast the data of every parameter of ``model`` to float32 and return the model."""
        for param in model.parameters():
            param.data = param.data.to(torch.float32)
        return model

    def load_data(self):
        """Download (if needed) and return the CIFAR-100 dataset with ``train=False``."""
        cifar100 = torchvision.datasets.CIFAR100(root='./data', download=True, train=False)
        return cifar100

    def tokenize_labels(self, classes):
        """Encode one heuristic prompt ("a photo of a <class>") per class.

        The normalised text features are cached in ``self.text_features`` and returned.
        """
        prompts = [clip.tokenize(f"a photo of a {c}") for c in classes]
        text_inputs = torch.cat(prompts).to(self.device)
        with torch.no_grad():
            text_features = self.model.encode_text(text_inputs)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        self.text_features = text_features
        return self.text_features

    def augment_image(self, image, num_augmentations=100):
        """Return a batch holding the preprocessed image followed by random augmentations.

        The first entry is the un-augmented image, so the batch has
        ``num_augmentations + 1`` entries.
        """
        augmentations = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.RandomRotation(degrees=30),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
            transforms.RandomResizedCrop(size=224, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
        ])
        augmented_images = [self.preprocess(image).unsqueeze(0).to(self.device)]
        for _ in range(num_augmentations):
            augmented_images.append(self.preprocess(augmentations(image)).unsqueeze(0).to(self.device))
        return torch.vstack(augmented_images)

    def compute_entropy(self, x, dim=None):
        """Shannon entropy (base 2) of the probabilities in ``x``.

        Args:
            x: tensor of probabilities.
            dim: dimension to sum over; ``None`` (default) sums over all elements.

        Zero probabilities contribute 0 instead of NaN: the clamp epsilon is
        chosen per dtype so that it does not underflow in float16.
        """
        eps = self._safe_eps(x, 1e-20)
        weighted_logs = x * torch.log2(x.clamp_min(eps))
        if dim is None:
            return -weighted_logs.sum()
        return -weighted_logs.sum(dim=dim)

    def class_probabilities(self, text_features, image_features):
        """Softmax over classes of the (scaled) cosine similarities, one row per image."""
        return (self.logit_scale * image_features @ text_features.T).softmax(dim=-1)

    def confidence_selection(self, similarities, percentile=0.8):
        """Keep the most confident (lowest-entropy) rows of ``similarities``.

        The ``int(n * percentile)`` highest-entropy rows are discarded, but at
        least one row is always kept. Selecting by rank instead of a strict
        threshold means tied entropies can no longer produce an empty
        selection. Kept rows stay in their original order and remain attached
        to the autograd graph.
        """
        num_views = similarities.shape[0]
        if num_views == 0:
            return similarities
        # Selection only needs the ranking, so it is computed without gradients
        # and in float32 to reduce rounding-induced ties.
        entropies = self.compute_entropy(similarities.detach().float(), dim=-1)
        fraction = min(max(percentile, 0.0), 1.0)
        num_kept = max(1, num_views - int(num_views * fraction))
        keep_idx = torch.topk(entropies, num_kept, largest=False).indices
        keep_idx = torch.sort(keep_idx).values
        return similarities[keep_idx]

    def entropy_loss_MEMO(self, image_features, text_features=None, conf_sel=False):
        """Mean of the per-view prediction entropies.

        Args:
            image_features: normalised image features, one row per view.
            text_features: normalised text features; defaults to the cached ones.
            conf_sel: when True, high-entropy views are ruled out first.
        """
        text_features = self._resolve_text_features(text_features)
        similarities = self.class_probabilities(text_features, image_features)
        # Apply confidence selection to rule out high entropy augmentations
        if conf_sel:
            similarities = self.confidence_selection(similarities)
        # Entropy of each view's class distribution, averaged across the views
        return self.compute_entropy(similarities, dim=-1).mean()

    def entropy_loss_TPT(self, image_features, text_features=None):
        """Entropy of the class distribution averaged over the confident views.

        Returns:
            ``(entropy, avg_probs)`` where ``avg_probs`` has one entry per class.
            Both stay attached to the autograd graph.
        """
        text_features = self._resolve_text_features(text_features)
        similarities = self.class_probabilities(text_features, image_features)
        # Confidence selection for the augmented views
        similarities = self.confidence_selection(similarities)
        # Average the class probabilities across the selected views
        avg_probs = similarities.mean(dim=0)
        return self.compute_entropy(avg_probs), avg_probs

    def grad_descent_step(self, loss):
        """Run one optimizer step on ``loss``."""
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, image):
        """Classify a single image with the current model weights.

        Returns:
            ``(prediction, similarity, entropy)``: the arg-max class index, the
            class-probability tensor (one row) and its entropy as a float.
        """
        text_features = self._resolve_text_features()
        image = self.preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            image_features = self._encode_image_features(image)

        similarity = self.class_probabilities(text_features, image_features)
        prediction = torch.argmax(similarity).item()
        entropy = float(self.compute_entropy(similarity))
        return prediction, similarity, entropy

    def MEMO(self, image, num_augmentations=100, conf_sel=False):
        """Adapt the model on augmentations of ``image``, predict, then undo the update.

        One gradient step is taken on ``entropy_loss_MEMO``; the prediction is
        made with the updated weights. Model parameters and optimizer state
        (momentum buffers) are restored afterwards, so calls do not influence
        each other.

        Returns:
            ``(prediction, similarity, entropy)`` as returned by ``predict``.
        """
        # Snapshot everything the adaptation step mutates.
        original_params = {name: param.detach().clone() for name, param in self.model.named_parameters()}
        # SGD updates its momentum buffers in place, hence the deep copy.
        optimizer_state = copy.deepcopy(self.optimizer.state_dict())

        # Require gradients to update the CLIP parameters
        self.require_CLIP_gradients(state=True)
        try:
            batch = self.augment_image(image, num_augmentations)
            image_features = self._encode_image_features(batch)

            loss = self.entropy_loss_MEMO(image_features, conf_sel=conf_sel)
            print(loss)
            self.grad_descent_step(loss)

            if any(torch.isnan(param).any() for param in self.model.parameters()):
                print("nan values detected in model parameters after updating")
            # Predict using the updated model
            prediction, similarity, entropy = self.predict(image)
        finally:
            # Restore original parameters and optimizer state
            with torch.no_grad():
                for name, param in self.model.named_parameters():
                    param.copy_(original_params[name])
            self.optimizer.load_state_dict(optimizer_state)
        return prediction, similarity, entropy

    def TPT(self, image, num_augmentations=100):
        """Predict from the class distribution averaged over confident augmented views.

        No parameters are updated, so the forward pass runs without gradients.

        Returns:
            ``(prediction, entropy)``: arg-max class index of the averaged
            distribution and its entropy as a float.
        """
        batch = self.augment_image(image, num_augmentations)
        with torch.no_grad():
            image_features = self._encode_image_features(batch)
            entropy, avg_probs = self.entropy_loss_TPT(image_features)
        prediction = torch.argmax(avg_probs).item()
        return prediction, float(entropy)

    def train_CoOp(self):
        """Placeholder for CoOp training; currently it only freezes the CLIP parameters."""
        # Prevent CLIP parameters from changing
        self.require_CLIP_gradients(state=False)


# Preparing the class for usage
# Build the model with its default arguments (ViT-B/32, GPU if available).
clip_model = CLIPModel()
# CIFAR-100 is downloaded to ./data on first use.
cifar100 = clip_model.load_data()
# Cache the text features of the class-name prompts inside clip_model.
clip_model.tokenize_labels(cifar100.classes)
# Sample 3637 is the single example image used by the cells below.
image, class_id = cifar100[3637]

Files already downloaded and verified


In [None]:
# Prediction using CLIP out of the box (no test-time adaptation).
# Prints the class probabilities, the predicted class index and the entropy.
prediction1, similarity1, entropy1 = clip_model.predict(image)
print(similarity1, prediction1, entropy1)

tensor([[0.0101, 0.0102, 0.0102, 0.0099, 0.0099, 0.0100, 0.0100, 0.0103, 0.0098,
         0.0100, 0.0101, 0.0100, 0.0097, 0.0098, 0.0101, 0.0099, 0.0101, 0.0098,
         0.0102, 0.0101, 0.0099, 0.0101, 0.0099, 0.0098, 0.0100, 0.0099, 0.0103,
         0.0104, 0.0100, 0.0102, 0.0096, 0.0100, 0.0103, 0.0098, 0.0096, 0.0100,
         0.0101, 0.0098, 0.0098, 0.0099, 0.0101, 0.0103, 0.0104, 0.0100, 0.0104,
         0.0099, 0.0100, 0.0098, 0.0098, 0.0100, 0.0103, 0.0102, 0.0100, 0.0100,
         0.0098, 0.0099, 0.0100, 0.0099, 0.0099, 0.0101, 0.0100, 0.0101, 0.0100,
         0.0099, 0.0100, 0.0100, 0.0100, 0.0100, 0.0096, 0.0096, 0.0099, 0.0098,
         0.0101, 0.0098, 0.0101, 0.0099, 0.0097, 0.0103, 0.0107, 0.0102, 0.0098,
         0.0098, 0.0100, 0.0104, 0.0099, 0.0102, 0.0101, 0.0099, 0.0101, 0.0098,
         0.0099, 0.0100, 0.0099, 0.0106, 0.0098, 0.0097, 0.0098, 0.0098, 0.0100,
         0.0103]], device='cuda:0', dtype=torch.float16) 78 6.64453125


In [None]:
# Prediction using MEMO at test time (one adaptation step on 100 augmentations,
# without confidence selection). MEMO also prints the loss before the results.
prediction2, similarity2, entropy2 = clip_model.MEMO(image, num_augmentations=100)
print(similarity2, prediction2, entropy2)

tensor(6.6445, device='cuda:0', dtype=torch.float16, grad_fn=<MeanBackward0>)
tensor([[0.0108, 0.0098, 0.0103, 0.0098, 0.0090, 0.0096, 0.0095, 0.0099, 0.0098,
         0.0100, 0.0102, 0.0103, 0.0091, 0.0095, 0.0101, 0.0094, 0.0104, 0.0096,
         0.0103, 0.0102, 0.0094, 0.0100, 0.0099, 0.0097, 0.0093, 0.0097, 0.0094,
         0.0107, 0.0102, 0.0109, 0.0091, 0.0098, 0.0094, 0.0107, 0.0087, 0.0101,
         0.0094, 0.0097, 0.0093, 0.0101, 0.0101, 0.0109, 0.0100, 0.0101, 0.0105,
         0.0095, 0.0103, 0.0120, 0.0097, 0.0102, 0.0095, 0.0103, 0.0120, 0.0102,
         0.0097, 0.0087, 0.0122, 0.0102, 0.0104, 0.0127, 0.0107, 0.0105, 0.0105,
         0.0098, 0.0096, 0.0094, 0.0091, 0.0093, 0.0090, 0.0097, 0.0101, 0.0093,
         0.0092, 0.0094, 0.0089, 0.0097, 0.0092, 0.0101, 0.0108, 0.0098, 0.0094,
         0.0092, 0.0107, 0.0111, 0.0099, 0.0106, 0.0104, 0.0103, 0.0101, 0.0105,
         0.0098, 0.0098, 0.0102, 0.0104, 0.0094, 0.0094, 0.0116, 0.0098, 0.0101,
         0.0105]], device='cuda

In [None]:
# Prediction using TPT (default number of augmentations).
# Prints the entropy first, then the predicted class index.
prediction3, entropy3 = clip_model.TPT(image)
print(entropy3)
print(prediction3)

nan
0


In [None]:
# Prediction using MEMO with confidence selection at test time
# (high-entropy augmentations are ruled out before the loss is computed).
prediction4, similarity4, entropy4 = clip_model.MEMO(image, num_augmentations=100, conf_sel=True)
print(similarity4, prediction4, entropy4)

RuntimeError: stack expects a non-empty TensorList

In [None]:
# Compare, element by element, the class probabilities obtained with plain CLIP (1),
# MEMO without confidence selection (2) and MEMO with confidence selection (4).
# All three are cast to float16 first so they are compared in the same dtype.
s1, s2, s4 = (sim.to(torch.float16) for sim in (similarity1, similarity2, similarity4))
comparisons = (
    ('Clip vs Memo without conf sel', s1, s2),
    ('Clip vs Memo with conf sel', s1, s4),
    ('Memo with conf sel vs Memo without conf sel', s2, s4),
)
for title, left, right in comparisons:
    print(title)
    print(left == right)