# TODO
- [ ] Add illustrations for the key concepts (DINOv2, instance retrieval, Faiss)
- [ ] There are a lot of possible optimizations I can apply for both search and encoding, but I don't have time for now
- [ ] Add an NLP pipeline that generates a descriptive caption from the input image and its nearest-neighbor image
- [ ] Deploy it to a Hugging Face Space as a demo
- [ ] Save the Faiss index state

# References & useful resources


In [None]:
# !git clone https://github.com/facebookresearch/dinov2.git

In [1]:
# %cd /dinov2
# !pip install -r requirements.txt
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-3.41.2-py3-none-any.whl (20.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.103.0-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.5.0 (from gradio)
  Downloading gradio_client-0.5.0-py3-none-any.whl (298 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.2/298.2 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# !pip install faiss-gpu
import os
from PIL import Image
import torch
import torchvision.transforms as transforms
import faiss
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torchvision.datasets import ImageFolder
import warnings

# Suppress every Python warning from this point on; note this also hides library deprecation notices.
warnings.filterwarnings("ignore")
import gradio as gr

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the `dinov2_vitb14` entry point from torch.hub, move the model to the
# selected device and put it in evaluation mode.
dino = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14").to(device).eval()

# Define the image transformations applied to every image before it reaches the model.
# The image is first forced to 3-channel RGB, which covers grayscale, palette and
# RGBA inputs while keeping the colour information. Grayscale(num_output_channels=3)
# is deliberately not used here: it replicates a single luminance channel three
# times, so no colour would ever reach the model.
image_transforms = transforms.Compose(
    [
        transforms.Lambda(lambda img: img.convert("RGB")),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        # Standard ImageNet channel statistics.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)


# Function to extract features from an image
def extract_features(image):
    """Run one image through the DINO model and return its output as float32.

    The image is passed through ``image_transforms``, given a leading batch
    dimension with ``unsqueeze(0)`` and moved to ``device``. The forward pass
    runs under ``torch.no_grad()``, so the result carries no autograd history.

    Args:
        image: The image to embed, in whatever form ``image_transforms`` accepts.

    Returns:
        The model output cast to ``torch.float32``, still on ``device``.
    """
    batch = image_transforms(image)
    batch = batch.unsqueeze(0)
    batch = batch.to(device)
    with torch.no_grad():
        return dino(batch).float()


# Function to perform nearest neighbor search
def nearest_neighbor_search(query_features, dataset_features, k=10):
    """Return the indices of the ``k`` dataset vectors nearest (L2) to each query.

    A flat (exhaustive) Faiss L2 index is built from ``dataset_features`` on
    every call and then searched with ``query_features``.

    Args:
        query_features: 2-D torch tensor of query vectors, one per row. Its
            second dimension sets the dimensionality of the index.
        dataset_features: Sequence of torch tensors that are concatenated along
            dimension 0 to form the dataset matrix.
        k: Number of neighbours to return for each query.

    Returns:
        Integer numpy array of shape ``(n_queries, k)``. Each row lists dataset
        row indices ordered from nearest to farthest. Positions for which no
        neighbour exists (fewer than ``k`` dataset vectors, or an empty
        dataset) hold ``-1``, which is the placeholder Faiss itself uses.
    """
    # Faiss expects C-contiguous float32 arrays; detach() makes the conversion
    # safe even for tensors that track gradients.
    queries = np.ascontiguousarray(
        query_features.detach().cpu().numpy(), dtype=np.float32
    )

    # torch.cat raises on an empty sequence, so answer "no neighbours" directly.
    if len(dataset_features) == 0:
        return np.full((queries.shape[0], k), -1, dtype=np.int64)

    dataset_matrix = np.ascontiguousarray(
        torch.cat(list(dataset_features), dim=0).detach().cpu().numpy(),
        dtype=np.float32,
    )

    index = faiss.IndexFlatL2(query_features.shape[1])
    index.add(dataset_matrix)
    _, indices = index.search(queries, k)
    return indices


# Function to return the nearest neighbor images
def return_nearest_neighbor_images(query_image):
    """Return the file paths of the dataset images most similar to ``query_image``.

    Reads the notebook-level globals ``dataset_features``, ``dataset_images``,
    ``dataset_path`` and ``k``. It assumes ``dataset_images[i]`` names the file
    whose features are row ``i`` of the concatenated ``dataset_features``.

    Args:
        query_image: The query image, or ``None`` when the UI submits without
            an uploaded image.

    Returns:
        A list of at most ``k`` image paths ordered from most to least similar.
        The list is empty when there is no query image or no indexed dataset.
    """
    # Gradio passes None when the user submits without uploading an image.
    if query_image is None:
        return []
    if not dataset_features:
        return []

    query_features = extract_features(query_image)
    # Ask for exactly k neighbours instead of relying on the search default.
    indices = nearest_neighbor_search(query_features, dataset_features, k=k)

    # Faiss marks missing neighbours with -1; as a list index that would
    # silently pick the last file, so those entries are dropped.
    return [
        os.path.join(dataset_path, dataset_images[idx])
        for idx in indices[0]
        if idx >= 0
    ]


# Build the dataset feature bank
dataset_path = "/content/drive/MyDrive/newpaper/Hader/dataset/jeans/"
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

# dataset_images and dataset_features are filled together so that
# dataset_images[i] is always the file that produced dataset_features[i].
# Indices returned by the nearest-neighbour search are positions in
# dataset_features, so any file skipped here must not appear in dataset_images.
dataset_images = []
dataset_features = []
# sorted() makes the index order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(dataset_path)):
    # Case-insensitive so files such as "IMG_001.JPG" are indexed too.
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue
    image_path = os.path.join(dataset_path, filename)
    # The context manager closes the file handle once features are extracted.
    with Image.open(image_path) as image:
        features = extract_features(image)
    dataset_images.append(filename)
    dataset_features.append(features)

k = 5  # Number of nearest neighbors to retrieve

# Gradio interface: one uploaded image in, a gallery of similar images out.
# gr.Image is used instead of gr.inputs.Image because the gr.inputs namespace
# is deprecated in Gradio 3.x and removed in 4.0.
inputs = gr.Image(type="pil", label="Input Image")

# The gallery displays the list of image paths returned by the callback.
outputs = gr.Gallery()

demo = gr.Interface(
    fn=return_nearest_neighbor_images,
    inputs=inputs,
    outputs=outputs,
    title="Recommendation Fusion",
    description="Please, uploading an image to  get the recommendation images.",
)

demo.launch()

Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

