In [1]:
import random
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.utils as utils
import matplotlib.pyplot as plt
import numpy as np
import tarfile
import os
from PIL import Image

from timeit import default_timer as timer
from tqdm.auto import tqdm
from torchvision import datasets, models, transforms
from torchsummary import summary
from typing import Callable, Dict, List, Tuple, Union
from torch.optim import lr_scheduler
# from torch.utils.tensorboard import SummaryWriter
# import skimage

import clip #importing clip model

print("Torch version:", torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


Torch version: 2.0.0+cu117


In [2]:
mean_CIFAR100 = [0.48145466, 0.4578275, 0.40821073]
std_CIFAR100 = [0.26862954, 0.26130258, 0.27577711]

class NormalizeInverse(torchvision.transforms.Normalize):
    def __init__(self, mean: List[float], std: List[float]) -> None:
        """Reconstructs the images in the input domain by inverting 
        the normalization transformation.

        Args:
            mean: the mean used to normalize the images.
            std: the standard deviation used to normalize the images.
        """
        mean = torch.as_tensor(mean)
        std = torch.as_tensor(std)
        std_inv = 1 / (std + 1e-7)
        mean_inv = -mean * std_inv
        super().__init__(mean=mean_inv, std=std_inv)

    def __call__(self, tensor):
        return super().__call__(tensor.clone())

def show_grid(dataset: torchvision.datasets.ImageFolder, 
              process: Callable = None) -> None:
    """Shows a grid with random images taken from the dataset.

    Args:
        dataset: the dataset containing the images.
        process: a function to apply on the images before showing them.        
    """
    fig = plt.figure(figsize=(15, 5))
    indices_random = np.random.randint(10, size=10, high=len(dataset))
    #indices_random= [x for x in range(8)]

    for count, idx in enumerate(indices_random):
        fig.add_subplot(2, 5, count + 1)
        title = dataset.classes[dataset[idx][1]]
        plt.title(title)
        image_processed = process(dataset[idx][0]) if process is not None else dataset[idx][0]
        plt.imshow(transforms.ToPILImage()(image_processed))
        plt.axis("off")
    
    plt.tight_layout()
    plt.show()

In [3]:
device = "cpu"
if torch.cuda.is_available:
  print('cuda available')
  device = torch.device("cuda:0")
else:
  print('Please set GPU via Edit -> Notebook Settings.')

cuda available


In [4]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [5]:
# 왜 vit32 사용했는지

model, preprocess = clip.load("ViT-B/32",jit=False) #loading the CLIP model based on ViT

In [6]:
model.cuda().eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [7]:
# 이미지 크기 : 224
input_resolution = model.visual.input_resolution
# 텍스트 입력 최대 길이 : 77
context_length = model.context_length
# 텍스트 관련 부분에서 사용되는 어휘 사전의 크기. 모델이 인식하 수 있는 고유의 단어 개수 : 49408
vocab_size = model.vocab_size

In [8]:
from torchvision.datasets import MNIST
import random


mnist = MNIST(os.path.expanduser("~/.cache"), train=True, transform=preprocess, download=False)
num_samples = 10000
selected_samples = random.sample(range(len(mnist)), num_samples)
mnist_subset = torch.utils.data.Subset(mnist, selected_samples)

len(mnist_subset)

10000

In [9]:
val_ratio = 0.5
test_ratio = 0.5

val_size = int(len(mnist_subset) * val_ratio)
test_size = len(mnist_subset) - val_size

val_set, test_set = torch.utils.data.random_split(mnist_subset, [val_size, test_size])

print("Number of samples in validation set:", len(val_set))
print("Number of samples in test set:", len(test_set))

Number of samples in validation set: 5000
Number of samples in test set: 5000


### Validation

In [None]:
val_image_input = torch.tensor(np.stack([val_set[x][0] for x in range(len(val_set))])).cuda()
# Label of validation dataset
val_labels = torch.tensor(np.stack([val_set[x][1] for x in range(len(val_set))]))

In [None]:
# test_image_input = torch.tensor(np.stack([test_set[x][0] for x in range(len(test_set))])).cuda()
# # Label of test dataset
# test_labels = torch.tensor(np.stack([test_set[x][1] for x in range(len(test_set))]))

In [None]:
with torch.no_grad():
    image_features = model.encode_image(val_image_input).float()

In [26]:
# text_descriptions = [f"a photo of a {label}" for label in mnist_subset.classes]
text_descriptions = [f"a photo of a {mnist_subset.dataset.classes[label]}" for _, label in mnist_subset]

In [27]:
text_tokens_ensembled = clip.tokenize(text_descriptions).cuda()
text_tokens = clip.tokenize([mnist_subset.dataset.classes[label] for _, label in mnist_subset]).to(device)

In [28]:
with torch.no_grad():
    text_features = model.encode_text(text_tokens).float()
    text_features /= text_features.norm(dim=-1, keepdim=True)
    text_features_ensembled = model.encode_text(text_tokens_ensembled).float()
    text_features_ensembled /= text_features_ensembled.norm(dim=-1, keepdim=True)
#the 100.0 works as temperature parameter, raising the softmax confidence 
text_probs_notens = (100.0 * image_features @ text_features.T).softmax(dim=-1)
text_probs = ( 100.0 * image_features @ text_features_ensembled.T).softmax(dim=-1)

top_probs, top_labels = text_probs.cpu().topk(3, dim=-1)
top_probs_n, top_labels_n = text_probs_notens.cpu().topk(3, dim=-1)

In [30]:
correct_labels = 0
correct_labels_not = 0
for index, prediction in enumerate(top_labels):
  if prediction[0]==val_labels[index]:
    correct_labels=correct_labels+1
for index, prediction in enumerate(top_labels_n):
  if prediction[0]==val_labels[index]:
    correct_labels_not=correct_labels_not+1    
print("The overall accuracy for the CLIP Zero shot model with ensembling is: {}".format((correct_labels/len(top_labels))))
print("The overall accuracy for the CLIP Zero shot model without ensembling is: {}".format((correct_labels_not/len(top_labels_n))))

print(correct_labels)
print(correct_labels_not)

The overall accuracy for the CLIP Zero shot model with ensembling is: 0.0578
The overall accuracy for the CLIP Zero shot model without ensembling is: 0.0182
289
91
