In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import uuid
from dotenv import load_dotenv
import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn.functional as F
import pinecone
from pinecone import Pinecone, ServerlessSpec
from PIL import Image

# Vectorization function import
import importlib
from img2vec import img2vec



In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.225, 0.225, 0.225])
])

In [5]:
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

100.0%


In [6]:
vec_dimensions = img2vec(train_dataset[0][0]).shape[0]

In [7]:
vec_dimensions

2048

In [10]:
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = f"cifar100-index-{vec_dimensions}"

# Index has already been created!
# pc.create_index(
#   name=index_name,
#   dimension=vec_dimensions,
#   metric="cosine",
#   spec=ServerlessSpec(
#     cloud="aws",
#     region="us-east-1"
#   )
# )

cifar_index = pc.Index(index_name)

In [11]:
cifar_index.describe_index_stats()

{'dimension': 2048,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [None]:
#cifar_index.delete(delete_all=True)

In [21]:
cifar100_super_classes = ["aquatic mammals", "fish", "flowers", "food containers", "fruit and vegetables", "household electrical devices", "household furniture", "insects", "large carnivores", "large man-made outdoor things", "large natural outdoor scenes", "large omnivores and herbivores", "medium-sized mammals", "non-insect invertebrates", "people", "reptiles", "small mammals", "trees", "vehicles 1", "vehicles 2"]
cifar100_classes = ["beaver", "dolphin", "otter", "seal", "whale", "aquarium fish", "flatfish", "ray", "shark", "trout", "orchids", "poppies", "roses", "sunflowers", "tulips", "bottles", "bowls", "cans", "cups", "plates", "apples", "mushrooms", "oranges", "pears", "sweet peppers", "clock", "computer keyboard", "lamp", "telephone", "television", "bed", "chair", "couch", "table", "wardrobe", "bee", "beetle", "butterfly", "caterpillar", "cockroach", "bear", "leopard", "lion", "tiger", "wolf", "bridge", "castle", "house", "road", "skyscraper", "cloud", "forest", "mountain", "plain", "sea", "camel", "cattle", "chimpanzee", "elephant", "kangaroo", "fox", "porcupine", "possum", "raccoon", "skunk", "crab", "lobster", "snail", "spider", "worm", "baby", "boy", "girl", "man", "woman", "crocodile", "dinosaur", "lizard", "snake", "turtle", "hamster", "mouse", "rabbit", "shrew", "squirrel", "maple", "oak", "palm", "pine", "willow", "bicycle", "bus", "motorcycle", "pickup truck", "train", "lawn-mower", "rocket", "streetcar", "tank", "tractor"]

In [None]:
vectors_by_classes = {}

for i in range(len(cifar100_classes)):
  num_vecs = 0
  vectors_by_classes[i] = []

  for j in range(len(train_dataset)):
    if num_vecs == 300:
      break

    if train_dataset[j][1] == i:
      num_vecs += 1
      vectors_by_classes[i].append(img2vec(train_dataset[j][0]))
      print(f"Added vector {len(vectors_by_classes[i])} in class {i}")

Added vector 1 in class 0
Added vector 2 in class 0
Added vector 3 in class 0
Added vector 4 in class 0
Added vector 5 in class 0
Added vector 6 in class 0
Added vector 7 in class 0
Added vector 8 in class 0
Added vector 9 in class 0
Added vector 10 in class 0
Added vector 11 in class 0
Added vector 12 in class 0
Added vector 13 in class 0
Added vector 14 in class 0
Added vector 15 in class 0
Added vector 16 in class 0
Added vector 17 in class 0
Added vector 18 in class 0
Added vector 19 in class 0
Added vector 20 in class 0
Added vector 21 in class 0
Added vector 22 in class 0
Added vector 23 in class 0
Added vector 24 in class 0
Added vector 25 in class 0
Added vector 26 in class 0
Added vector 27 in class 0
Added vector 28 in class 0
Added vector 29 in class 0
Added vector 30 in class 0
Added vector 31 in class 0
Added vector 32 in class 0
Added vector 33 in class 0
Added vector 34 in class 0
Added vector 35 in class 0
Added vector 36 in class 0
Added vector 37 in class 0
Added vect

In [None]:
# Upserting hypervectors

for c in vectors_by_classes:
    for i in range(len(vectors_by_classes[c])):
        cifar_index.upsert(vectors=[(str(uuid.uuid4()), vectors_by_classes[c][i], {"class": c})])
        print(f"Upserted vector {i+1}/450 in class {c}")

In [None]:
idx = 500

test_query_vector = img2vec(test_dataset[idx][0])
test_label = test_dataset[idx][1]

response = cifar_index.query(
    namespace="",
    vector=test_query_vector.tolist(),
    top_k=5,
    include_values=False,
    include_metadata=True,
)

counts = {}

for vec in response["matches"]:
    counts[int(vec["metadata"]["class"])] = counts.get(vec["metadata"]["class"], 0) + 1 

max(counts, key=counts.get)

In [None]:
test_label

In [None]:
accuracy = 0
test_dataset_size = len(test_dataset)

for i in range(len(test_dataset)):
    test_query_vector = img2vec(test_dataset[i][0])
    label = test_dataset[i][1]

    response = cifar_index.query(
        namespace="",
        vector=test_query_vector.tolist(),
        top_k=5,
        include_values=False,
        include_metadata=True,
    )

    counts = {}
    
    for vec in response["matches"]:
        counts[int(vec["metadata"]["class"])] = counts.get(vec["metadata"]["class"], 0) + 1 
    
    pred = max(counts, key=counts.get)

    accuracy += 1 if (pred == label) else 0

    if i % 10 == 0:
        print(f"Iteration {i} accuracy: {accuracy / (i+1)}")

print()
print()
print()
print()
print(f"Accuracy: {accuracy / test_dataset_size:.4f}")

In [None]:
print(f"Accuracy: {(accuracy / test_dataset_size)*100:.2f}%")