<a href="https://colab.research.google.com/github/komazawa-deep-learning/komazawa-deep-learning.github.io/blob/master/2024notebooks/2024_1030embedding_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### [Vision Embedding Comparison for Image Similarity Search: EfficientNet vs. ViT vs. DINO-v2 vs. CLIP vs. BLIP2](https://gist.github.com/tanukon/00d689478ee3f7d2abd0366f1352cf9d)
* source: https://gist.github.com/tanukon/00d689478ee3f7d2abd0366f1352cf9d

In [None]:
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(device)

try:
    import faiss
except ImportError:
    !pip install faiss-cpu
    import faiss

cpu


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount.
from google.colab import drive
drive.mount('/content/drive')

# Dataset preparation commands, kept commented out for reference: list the shared
# folder, unzip the Flickr30k archive, create an images/ directory, move the
# extracted JPEGs into it, and list the first few files.
#!ls -l "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k"
#!unzip "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k/flickr30k_images.zip"
#!mkdir "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k/images"
#!mv *.jpg "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k/images"
#!ls "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k/images/" | head

import os
# Show the runtime user's home directory as read from the HOME environment variable.
HOME = os.environ['HOME']
print(HOME)

Mounted at /content/drive
/root


In [None]:
#import faiss
#import os
import numpy as np
import pandas as pd
import torch

from PIL import Image
from tqdm import tqdm
from transformers import AutoImageProcessor, EfficientNetModel, ViTModel, AutoModel, CLIPProcessor, CLIPModel, Blip2Processor, Blip2Model

from transformers import Pipeline
from transformers.image_utils import load_image

In [None]:
# Directory that holds the Flickr30k JPEG images. This is a relative path, so it is
# resolved against the notebook's current working directory.
#dataset_dir = '<Your dataset directory>'
dataset_dir = "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k/images"

In [None]:
# Open one sample image from the dataset and print its (width, height).
test_image_path = os.path.join(dataset_dir, '36979.jpg')
test_image = Image.open(test_image_path)
print(test_image.size)

(500, 375)


## EfficientNet feature extraction

In [None]:
# Load the EfficientNet-B7 image processor and backbone weights from one checkpoint id.
checkpoint = "google/efficientnet-b7"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = EfficientNetModel.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/495 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/70.2k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/267M [00:00<?, ?B/s]

In [None]:
# Convert the sample image into PyTorch model inputs
inputs = image_processor(test_image, return_tensors='pt')
print('input shape: ', inputs['pixel_values'].shape)

# Forward pass without gradient tracking, asking the model to return every hidden state
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# The last entry of hidden_states is used as the feature map
embedding = outputs.hidden_states[-1]
print('embedding shape: ', embedding.shape)

# Average over dims 2 and 3 (the spatial axes) to get one vector per image
embedding = embedding.mean(dim=(2, 3))
print('after reducing: ', embedding.shape)

input shape:  torch.Size([1, 3, 600, 600])
embedding shape:  torch.Size([1, 640, 19, 19])
after reducing:  torch.Size([1, 640])


## ViT feature extraction

In [None]:
# Load the ViT-Large (patch 16, 224px, ImageNet-21k) image processor and weights.
checkpoint = "google/vit-large-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = ViTModel.from_pretrained(checkpoint)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [None]:
# Convert the sample image into PyTorch model inputs
inputs = image_processor(test_image, return_tensors='pt')
print('input shape: ', inputs['pixel_values'].shape)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

# Keep only the token at sequence position 0 as the image embedding.
# squeeze(1) leaves the result unchanged unless the hidden size is 1.
embedding = outputs.last_hidden_state[:, 0].squeeze(1)
print('embedding shape: ', embedding.shape)

input shape:  torch.Size([1, 3, 224, 224])
embedding shape:  torch.Size([1, 1024])


## DINO-v2 feature extraction

In [None]:
# Load the DINO-v2 base image processor and weights from one checkpoint id.
checkpoint = 'facebook/dinov2-base'
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
# Convert the sample image into PyTorch model inputs
inputs = image_processor(images=test_image, return_tensors='pt')
print('input shape: ', inputs['pixel_values'].shape)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

# Keep only the token at sequence position 0 as the image embedding.
# squeeze(1) leaves the result unchanged unless the hidden size is 1.
embedding = outputs.last_hidden_state[:, 0].squeeze(1)
print('embedding shape: ', embedding.shape)

input shape:  torch.Size([1, 3, 224, 224])
embedding shape:  torch.Size([1, 768])


## CLIP feature extraction

In [None]:
# Load the CLIP (ViT-B/32) processor and model weights from one checkpoint id.
checkpoint = "openai/clip-vit-base-patch32"
image_processor = CLIPProcessor.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint)

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [None]:
# Convert the sample image into PyTorch model inputs (no text is passed to the processor)
inputs = image_processor(images=test_image, return_tensors='pt', padding=True)
print('input shape: ', inputs['pixel_values'].shape)

# Forward pass without gradient tracking; get_image_features is called instead of the
# full model so only the image side is evaluated.
with torch.no_grad():
    outputs = model.get_image_features(**inputs)

# The returned value is used directly as the embedding, with no further pooling.
print('embedding shape: ', outputs.shape)

input shape:  torch.Size([1, 3, 224, 224])
embedding shape:  torch.Size([1, 512])


## BLIP-2 feature extraction

In [None]:
# Load the BLIP-2 (OPT-2.7b) processor and model; the model weights are loaded in float16.
checkpoint = "Salesforce/blip2-opt-2.7b"
image_processor = Blip2Processor.from_pretrained(checkpoint)
model = Blip2Model.from_pretrained(checkpoint, torch_dtype=torch.float16)

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

In [None]:
# prepare input image
inputs = image_processor(images=test_image, return_tensors='pt', padding=True)
# The model was loaded with torch_dtype=torch.float16, while image processors return
# float32 pixel values by default. Cast the pixels to the model's dtype so the forward
# pass does not fail on a float/half mismatch.
inputs['pixel_values'] = inputs['pixel_values'].to(model.dtype)
print('input shape: ', inputs['pixel_values'].shape)

# Forward pass through the vision encoder and Q-Former only, without gradient tracking
with torch.no_grad():
    outputs = model.get_qformer_features(**inputs)
    outputs = outputs.last_hidden_state

# Average over dim 1 (the Q-Former query tokens) to obtain one vector per image
embedding = torch.mean(outputs, dim=1).squeeze(1)
print('after reducing: ', embedding.shape)

## Image similarity search

### custom pipeline for EfficientNet

In [None]:
class EfficientNetPipeline(Pipeline):
    """Pipeline that maps an image to a spatially averaged EfficientNet embedding."""

    def _sanitize_parameters(self, **kwargs):
        # No call-time options are supported: return empty kwargs for
        # preprocess, _forward and postprocess.
        preprocess_kwargs, forward_kwargs, postprocess_kwargs = {}, {}, {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, image):
        """Load ``image`` with ``load_image`` and convert it to PyTorch model inputs."""
        return self.image_processor(images=load_image(image), return_tensors="pt")

    def _forward(self, model_inputs):
        """Run the model without gradient tracking, requesting all hidden states."""
        with torch.no_grad():
            return self.model(**model_inputs, output_hidden_states=True)

    def postprocess(self, model_outputs):
        """Average the last hidden state over dims 2 and 3 and return the result."""
        final_feature_map = model_outputs.hidden_states[-1]
        return final_feature_map.mean(dim=(2, 3))

### custom pipeline for ViT

In [None]:
class ViTPipeline(Pipeline):
    """Pipeline that maps an image to the position-0 token embedding of a ViT model."""

    def _sanitize_parameters(self, **kwargs):
        # No call-time options are supported: return empty kwargs for
        # preprocess, _forward and postprocess.
        preprocess_kwargs, forward_kwargs, postprocess_kwargs = {}, {}, {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, image):
        """Load ``image`` with ``load_image`` and convert it to PyTorch model inputs."""
        return self.image_processor(images=load_image(image), return_tensors="pt")

    def _forward(self, model_inputs):
        """Run the model without gradient tracking."""
        with torch.no_grad():
            return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        """Return the token at sequence position 0 of the last hidden state."""
        first_token = model_outputs.last_hidden_state[:, 0]
        # squeeze(1) leaves the result unchanged unless the hidden size is 1.
        return first_token.squeeze(1)


### custom pipeline for DINO-v2

In [None]:
class DINOv2Pipeline(Pipeline):
    """Pipeline that maps an image to the position-0 token embedding of a DINO-v2 model."""

    def _sanitize_parameters(self, **kwargs):
        # No call-time options are supported: return empty kwargs for
        # preprocess, _forward and postprocess.
        preprocess_kwargs, forward_kwargs, postprocess_kwargs = {}, {}, {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, image):
        """Load ``image`` with ``load_image`` and convert it to PyTorch model inputs."""
        return self.image_processor(images=load_image(image), return_tensors="pt")

    def _forward(self, model_inputs):
        """Run the model without gradient tracking."""
        with torch.no_grad():
            return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        """Return the token at sequence position 0 of the last hidden state."""
        first_token = model_outputs.last_hidden_state[:, 0]
        # squeeze(1) leaves the result unchanged unless the hidden size is 1.
        return first_token.squeeze(1)

### custom pipeline for CLIP

In [None]:
class CLIPPipeline(Pipeline):
    """Pipeline that maps an image to the output of CLIP's ``get_image_features``."""

    def _sanitize_parameters(self, **kwargs):
        # No call-time options are supported: return empty kwargs for
        # preprocess, _forward and postprocess.
        preprocess_kwargs, forward_kwargs, postprocess_kwargs = {}, {}, {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, image):
        """Load ``image`` with ``load_image`` and convert it to PyTorch model inputs."""
        return self.image_processor(images=load_image(image), return_tensors="pt")

    def _forward(self, model_inputs):
        """Compute the image features without gradient tracking."""
        with torch.no_grad():
            return self.model.get_image_features(**model_inputs)

    def postprocess(self, model_outputs):
        """Pass the image features through unchanged."""
        return model_outputs


### custom pipeline for BLIP2

In [None]:
class BLIP2Pipeline(Pipeline):
    """Pipeline that maps an image to the mean of BLIP-2's Q-Former output tokens."""

    def _sanitize_parameters(self, **kwargs):
        # No call-time options are supported: return empty kwargs for
        # preprocess, _forward and postprocess.
        return {}, {}, {}

    def preprocess(self, image):
        """Load ``image`` and convert it to model inputs in the model's dtype.

        The pixel values are cast to ``self.model.dtype`` so a model loaded in
        reduced precision (e.g. float16) does not receive float32 pixels.
        """
        image = load_image(image)
        model_inputs = self.image_processor(images=image, return_tensors="pt")
        # Cast here because the custom pipeline steps in this class move no dtype themselves.
        model_inputs["pixel_values"] = model_inputs["pixel_values"].to(self.model.dtype)

        return model_inputs

    def _forward(self, model_inputs):
        """Run the Q-Former feature extraction without gradient tracking."""
        with torch.no_grad():
            outputs = self.model.get_qformer_features(**model_inputs)

        return outputs

    def postprocess(self, model_outputs):
        """Average the last hidden state over dim 1 and return the result."""
        embedding = model_outputs.last_hidden_state
        embedding = torch.mean(embedding, dim=1).squeeze(1)

        return embedding

In [None]:
def define_model(model_name: str):
    """Load the pre-trained image processor and model for ``model_name``.

    Args:
        model_name: One of 'EfficientNet', 'ViT', 'DINO-v2', 'CLIP' or 'BLIP2'.

    Returns:
        A ``(image_processor, model)`` tuple. The BLIP2 model is loaded in float16.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'EfficientNet':
        image_processor = AutoImageProcessor.from_pretrained("google/efficientnet-b7")
        model = EfficientNetModel.from_pretrained("google/efficientnet-b7")
    elif model_name == 'ViT':
        image_processor = AutoImageProcessor.from_pretrained("google/vit-large-patch16-224-in21k")
        model = ViTModel.from_pretrained("google/vit-large-patch16-224-in21k")
    elif model_name == 'DINO-v2':
        image_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
        model = AutoModel.from_pretrained('facebook/dinov2-base')
    elif model_name == 'CLIP':
        image_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    elif model_name == 'BLIP2':
        image_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
    else:
        # Fail with a clear message instead of an UnboundLocalError at the return below.
        raise ValueError(
            f"unknown model_name {model_name!r}; expected one of "
            "'EfficientNet', 'ViT', 'DINO-v2', 'CLIP', 'BLIP2'"
        )

    return image_processor, model

In [None]:
def load_pipeline(model_name: str):
    """Build the embedding pipeline that matches ``model_name``.

    Args:
        model_name: One of 'EfficientNet', 'ViT', 'DINO-v2', 'CLIP' or 'BLIP2'.

    Returns:
        An instance of the matching custom pipeline class, created with the
        processor and model from ``define_model`` and placed on the global ``device``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Resolve the pipeline class first so an unknown name fails before any model is loaded.
    if model_name == 'EfficientNet':
        pipeline_cls = EfficientNetPipeline
    elif model_name == 'ViT':
        pipeline_cls = ViTPipeline
    elif model_name == 'DINO-v2':
        pipeline_cls = DINOv2Pipeline
    elif model_name == 'CLIP':
        pipeline_cls = CLIPPipeline
    elif model_name == 'BLIP2':
        pipeline_cls = BLIP2Pipeline
    else:
        raise ValueError(
            f"unknown model_name {model_name!r}; expected one of "
            "'EfficientNet', 'ViT', 'DINO-v2', 'CLIP', 'BLIP2'"
        )

    image_processor, model = define_model(model_name=model_name)
    pipeline = pipeline_cls(model=model, image_processor=image_processor, device=device)

    return pipeline

In [None]:
def register_embeddings(embeddings):
    """Create a Faiss inner-product index and add the L2-normalized embeddings to it.

    ``faiss.normalize_L2`` is applied to ``embeddings`` itself, so the array passed
    in by the caller is normalized in place.

    Args:
        embeddings: 2-D array; its second dimension sets the index dimensionality.

    Returns:
        The populated ``faiss.IndexFlatIP`` index.
    """
    dim = embeddings.shape[1]
    flat_index = faiss.IndexFlatIP(dim)

    # With unit-length vectors the inner product equals cosine similarity.
    faiss.normalize_L2(embeddings)
    flat_index.add(embeddings)

    return flat_index

def image_similarity_search(embeddings, index, image_name_list, model_name, result_dir, top_k=6):
    """Query ``index`` with every embedding and save the top-k image names as CSV.

    Row ``i`` of the CSV holds the names retrieved for ``embeddings[i]`` in columns
    ``top0_similar`` ... ``top{top_k - 1}_similar``, in the order the index returns them.

    Args:
        embeddings: 2-D array of query vectors; each row is L2-normalized before
            searching (in place when the row is a view of ``embeddings``).
        index: Faiss index to search.
        image_name_list: Names indexed by the ids that ``index`` returns.
        model_name: Used as the CSV file stem.
        result_dir: Directory the CSV is written to.
        top_k: Number of neighbours to retrieve per query.
    """
    # One result column per rank, sized from top_k so any top_k value works.
    columns = [f'top{rank}_similar' for rank in range(top_k)]
    result_dict = {column: [] for column in columns}

    for embed in embeddings:
        embed = embed.reshape(1, -1)
        faiss.normalize_L2(embed)
        _, ann = index.search(embed, k=top_k)

        for column, idx in zip(columns, ann[0]):
            # Faiss pads with -1 when fewer than top_k neighbours exist; a raw -1
            # would silently pick the last image name, so store None instead.
            result_dict[column].append(image_name_list[idx] if idx >= 0 else None)

    df = pd.DataFrame.from_dict(result_dict)
    df.to_csv(os.path.join(result_dir, f'{model_name}.csv'), index=False)

In [None]:

def collect_data(dataset_dir):
    """Recursively collect the paths of all ``.jpg`` files under ``dataset_dir``.

    Files are matched by extension (case-insensitive), not by substring, so names
    that merely contain "jpg" are skipped. The result is sorted because
    ``os.walk`` yields entries in an unspecified order, and list positions are
    later matched against rows of an embedding array that may be loaded from a
    cache written by an earlier run.

    Args:
        dataset_dir: Root directory to walk.

    Returns:
        Sorted list of file paths; empty if nothing matches.
    """
    files_list = [
        os.path.join(root, name)
        for root, _, files in os.walk(dataset_dir)
        for name in files
        if name.lower().endswith('.jpg')
    ]
    files_list.sort()

    return files_list

In [None]:
# Which embedding model to run; must be a name handled by define_model/load_pipeline:
# 'EfficientNet', 'ViT', 'DINO-v2', 'CLIP' or 'BLIP2'.
model_name = 'ViT'
# Number of image paths handed to the pipeline per call.
batch_size = 16

# dataset_dir: directory that is searched for .jpg images.
# result_dir: where the cached embeddings (<model_name>.npy) and the search
# results (<model_name>.csv) are written; created if missing.
#dataset_dir = '<Your dataset directory>'
#result_dir = './results_test'
dataset_dir = "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k/images"
result_dir =  "drive/Shareddrives/#2024認知心理学研究(1)b/浅川先生/Flickr30k/results_test"
os.makedirs(result_dir, exist_ok=True)

In [None]:
# load pipeline
pipeline = load_pipeline(model_name=model_name)

# load dataset
dataset = collect_data(dataset_dir=dataset_dir)

if os.path.exists(os.path.join(result_dir, f'{model_name}.npy')):
    embeddings = np.load(os.path.join(result_dir, f'{model_name}.npy'))
    print('embedding shape: ', embeddings.shape)
else:
    # result embedding list
    embeddings = []

    for idx in tqdm(range(0, len(dataset), batch_size)):

        data = dataset[idx: idx + batch_size]

        out = pipeline(data, batch_size=batch_size)
        embeddings += [embed.detach().cpu() for embed in out]

        del out
        torch.cuda.empty_cache()

    embeddings = torch.cat(embeddings, dim=0).detach().cpu().numpy()
    print('embedding shape: ', embeddings.shape)
    np.save(os.path.join(result_dir, f'{model_name}.npy'), embeddings)

# similarity search by Faiss
embeddings = embeddings.astype(np.float32)
index = register_embeddings(embeddings)
image_similarity_search(embeddings=embeddings, index=index, image_name_list=dataset, model_name=model_name, result_dir=result_dir)

# References

* [1] Mingxing Tan, Quoc V. Le, EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks (2019), [Arxiv](https://arxiv.org/pdf/1905.11946)
* [2] Alexey Dosovitskiy, et al., AN IMAGE IS WORTH 16X16 WORDS: TRANSFORMERS FOR IMAGE RECOGNITION AT SCALE (2020), [Arxiv](https://arxiv.org/pdf/2010.11929)
* [3] Maxime Oquab, Timothée Darcet, Théo Moutakanni, et al., DINOv2: Learning Robust Visual Features without Supervision (2023), [Arxiv](https://arxiv.org/pdf/2304.07193)
* [4] Alec Radford, Jong Wook Kim, et al., Learning Transferable Visual Models From Natural Language Supervision (2021), [Arxiv](https://arxiv.org/pdf/2103.00020)
* [5] Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi, BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models (2023), [Arxiv](https://arxiv.org/pdf/2301.12597)
* [6] Peter Young, Alice Lai, Micah Hodosh, Julia Hockenmaier, From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions (2014), [MIT Press](https://aclanthology.org/Q14-1006.pdf)
* [7] Faiss, [Meta](https://ai.meta.com/tools/faiss/)