In [1]:
import sys
from google.colab import drive

# Attach Google Drive to the Colab VM; force_remount=True remounts even if a
# mount is already present.
drive.mount('/content/drive', force_remount=True)

# Folder (under "My Drive") that holds the unzipped assignment/project files,
# e.g. 'cs231n/assignments/assignment1/'.
# Note: this value starts with '/', so the path built below contains '//'.
FOLDERNAME = "/CS197"
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Make modules stored in that Drive folder importable from this notebook.
sys.path.append(f'/content/drive/My Drive/{FOLDERNAME}')

####### TODO: Download the CIFAR-10 dataset

Mounted at /content/drive


In [2]:
import subprocess

# Run `nvcc --version`, split its output on ", " and keep the field that starts
# with "release"; the last space-separated word of that field is the version.
nvcc_banner = subprocess.check_output(["nvcc", "--version"]).decode("UTF-8")
release_fields = [field for field in nvcc_banner.split(", ") if field.startswith("release")]
CUDA_version = release_fields[0].split(" ")[-1]
print("CUDA version:", CUDA_version)

# Wheel-name suffix per CUDA release. Any version not listed here falls back
# to the "+cu110" build, exactly like the final `else` of an if/elif chain.
torch_version_suffix = {
    "10.0": "+cu100",
    "10.1": "+cu101",
    "10.2": "",
}.get(CUDA_version, "+cu110")

CUDA version: 11.0


In [3]:
# Install pinned torch 1.7.1 / torchvision 0.8.2 builds, appending the CUDA
# wheel suffix computed in the previous cell, using the PyTorch wheel index
# passed via -f. Also installs ftfy and regex (unpinned).
! pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.7.1+cu110
[?25l  Downloading https://download.pytorch.org/whl/cu110/torch-1.7.1%2Bcu110-cp37-cp37m-linux_x86_64.whl (1156.8MB)
[K     |███████████████████████         | 834.1MB 1.3MB/s eta 0:04:07tcmalloc: large alloc 1147494400 bytes == 0x55fcef5d0000 @  0x7f179c8b2615 0x55fcb5ff806c 0x55fcb60d7eba 0x55fcb5ffae8d 0x55fcb60ec99d 0x55fcb606efe9 0x55fcb6069b0e 0x55fcb5ffc77a 0x55fcb606ee50 0x55fcb6069b0e 0x55fcb5ffc77a 0x55fcb606b86a 0x55fcb60ed7c6 0x55fcb606aee2 0x55fcb60ed7c6 0x55fcb606aee2 0x55fcb60ed7c6 0x55fcb606aee2 0x55fcb60ed7c6 0x55fcb616f431 0x55fcb60d0049 0x55fcb603ac84 0x55fcb5ffb8e9 0x55fcb606fade 0x55fcb5ffc69a 0x55fcb606aa45 0x55fcb6069e0d 0x55fcb5ffc77a 0x55fcb606aa45 0x55fcb5ffc69a 0x55fcb606aa45
[K     |█████████████████████████████▏  | 1055.7MB 1.2MB/s eta 0:01:27tcmalloc: large alloc 1434370048 bytes == 0x55fd33c26000 @  0x7f179c8b2615 0x55fcb5ff806c 0x55fcb60d7eba 0x55fcb5ffae

In [4]:
import numpy as np
import torch

# Show which torch build the kernel actually imported, so it can be compared
# with the version pinned in the pip install cell.
print("Torch version:", torch.__version__)

Torch version: 1.7.1+cu110


In [5]:
# Common URL prefix of the published CLIP checkpoints.
_CLIP_MODEL_ROOT = "https://openaipublic.azureedge.net/clip/models"

# (model name, 64-character hex directory name, checkpoint file name)
_CLIP_CHECKPOINTS = (
    ("RN50", "afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762", "RN50.pt"),
    ("RN101", "8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599", "RN101.pt"),
    ("RN50x4", "7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd", "RN50x4.pt"),
    ("ViT-B/32", "40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af", "ViT-B-32.pt"),
)

# Model name -> download URL of its checkpoint.
MODELS = {
    name: f"{_CLIP_MODEL_ROOT}/{directory}/{filename}"
    for name, directory, filename in _CLIP_CHECKPOINTS
}

In [6]:
# Download the ViT-B/32 checkpoint URL from MODELS and save it as model.pt in
# the working directory; a later cell loads that file with torch.jit.load.
! wget {MODELS["ViT-B/32"]} -O model.pt

--2021-04-26 19:10:24--  https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt
Resolving openaipublic.azureedge.net (openaipublic.azureedge.net)... 13.107.246.40, 13.107.213.40, 2620:1ec:bdf::40, ...
Connecting to openaipublic.azureedge.net (openaipublic.azureedge.net)|13.107.246.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 353976522 (338M) [application/octet-stream]
Saving to: ‘model.pt’


2021-04-26 19:10:25 (255 MB/s) - ‘model.pt’ saved [353976522/353976522]



In [7]:
# Load the downloaded TorchScript archive, move it to the GPU and put it in
# evaluation mode.
model = torch.jit.load("model.pt").cuda().eval()

# The model stores these settings as single-element tensors; .item() turns
# each one into a plain Python number.
input_resolution, context_length, vocab_size = (
    setting.item()
    for setting in (model.input_resolution, model.context_length, model.vocab_size)
)

# Total parameter count: product of each parameter's shape, summed.
param_count = sum(int(np.prod(p.shape)) for p in model.parameters())

print("Model parameters:", f"{param_count:,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [8]:
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
from torchvision.datasets import CIFAR10, CIFAR100
from utils import *
from prototype import Prototype
from prototypevector import PrototypeVector

# Three-element mean and standard-deviation tensors, created on the GPU.
# NOTE(review): presumably the per-channel (RGB) normalization statistics
# expected by the CLIP image encoder -- confirm against the model's reference
# preprocessing before relying on them.
image_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).cuda()
image_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).cuda()
# IMPORTANT (original author's note): these values may need to change.
# NOTE(review): the reason is not recorded in this notebook -- confirm.

In [9]:
# ftfy and regex are also requested by the earlier torch pip-install cell, so
# this install is a repeat of that request.
! pip install ftfy regex
# Download the gzipped BPE vocabulary file into the working directory.
# NOTE(review): presumably read by SimpleTokenizer when it is constructed -- confirm.
! wget https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz -O bpe_simple_vocab_16e6.txt.gz

--2021-04-26 19:10:34--  https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz
Resolving openaipublic.azureedge.net (openaipublic.azureedge.net)... 13.107.246.40, 13.107.213.40, 2620:1ec:bdf::40, ...
Connecting to openaipublic.azureedge.net (openaipublic.azureedge.net)|13.107.246.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1356917 (1.3M) [application/octet-stream]
Saving to: ‘bpe_simple_vocab_16e6.txt.gz’


2021-04-26 19:10:34 (74.6 MB/s) - ‘bpe_simple_vocab_16e6.txt.gz’ saved [1356917/1356917]



In [10]:
import os
import skimage #Has some images in here - check original "Interacting with CLIP.ipynb" document
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

from collections import OrderedDict
import torch

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [11]:
# NOTE(review): SimpleTokenizer is neither defined nor explicitly imported in
# this notebook; presumably it is provided by `from utils import *` -- confirm.
tokenizer = SimpleTokenizer()

In [12]:
def encodeTextWithTokenizer(tokenizer, texts, context_length=None):
  """Tokenize text descriptions into a zero-padded batch of token ids on the GPU.

  Each description is prefixed with "This is ", encoded with ``tokenizer``,
  wrapped in the start-of-text / end-of-text tokens and written into one row
  of a ``(len(texts), context_length)`` ``torch.long`` tensor; unused
  positions stay 0.

  Args:
    tokenizer: object with an ``encode(str)`` method returning a list of token
      ids and an ``encoder`` mapping that contains '<|startoftext|>' and
      '<|endoftext|>'.
    texts: iterable of description strings.
    context_length: row width of the output. When None (the default) the
      value is read from the notebook-level ``model.context_length``.

  Returns:
    The token tensor after ``.cuda()``. Its shape is also printed.

  Raises:
    RuntimeError: if a wrapped token sequence is longer than ``context_length``.
  """
  if context_length is None:
    # Default to the width of the globally loaded model.
    context_length = model.context_length
  context_length = int(context_length)

  prompts = ["This is " + desc for desc in texts]
  text_tokens = [tokenizer.encode(prompt) for prompt in prompts]
  sot_token = tokenizer.encoder['<|startoftext|>']
  eot_token = tokenizer.encoder['<|endoftext|>']

  text_input = torch.zeros(len(text_tokens), context_length, dtype=torch.long)
  for i, tokens in enumerate(text_tokens):
      tokens = [sot_token] + tokens + [eot_token]
      if len(tokens) > context_length:
          # Without this check the slice assignment below fails with an opaque
          # shape-mismatch RuntimeError; report which input is the problem.
          raise RuntimeError(
              f"Input {prompts[i]!r} is too long for context length {context_length} "
              f"({len(tokens)} tokens including start/end markers)"
          )
      text_input[i, :len(tokens)] = torch.tensor(tokens)

  text_input = text_input.cuda()
  print(text_input.shape)
  return text_input

In [13]:
def encodeImageAndTextInModel(model, imageInput=None, textInput=None):
  """Encode an image batch and/or a text batch with ``model``, without gradients.

  Args:
    model: object exposing ``encode_image`` and ``encode_text``.
    imageInput: value passed to ``model.encode_image``; None skips image encoding.
    textInput: value passed to ``model.encode_text``; None skips text encoding.

  Returns:
    Tuple ``(image_features, text_features)``. Each entry is the matching
    encoder output converted with ``.float()``, or None when that input was
    not supplied.
  """
  image_features, text_features = None, None
  with torch.no_grad():
    # Compare with None explicitly: truth-testing a tensor with more than one
    # element raises RuntimeError, so `if imageInput:` fails on a real batch.
    if imageInput is not None:
      image_features = model.encode_image(imageInput).float()
    if textInput is not None:
      text_features = model.encode_text(textInput).float()
  return image_features, text_features

In [14]:
# NOTE(review): this definition reuses the name of the image/text helper defined
# in the previous cell and replaces it once this cell runs. The name is kept so
# existing calls keep working; consider giving the image-pair variant its own name.
def encodeImageAndTextInModel(model, imageInputOne=None, imageInputTwo=None):
  """Encode up to two image batches with ``model.encode_image``, without gradients.

  Args:
    model: object exposing ``encode_image``.
    imageInputOne: first value passed to ``model.encode_image``; None skips it.
    imageInputTwo: second value passed to ``model.encode_image``; None skips it.

  Returns:
    Tuple ``(image_features_one, image_features_two)``. Each entry is the
    encoder output converted with ``.float()``, or None when that input was
    not supplied.
  """
  image_features_one, image_features_two = None, None
  with torch.no_grad():
    # Test this function's own parameters (the previous code tested the
    # undefined names `imageInput` / `textInput`), and compare with None
    # because truth-testing a multi-element tensor raises RuntimeError.
    if imageInputOne is not None:
      image_features_one = model.encode_image(imageInputOne).float()
    if imageInputTwo is not None:
      image_features_two = model.encode_image(imageInputTwo).float()
  return image_features_one, image_features_two