# DINOv2

In [1]:
# Install every third-party package this notebook imports. %pip (rather than
# bare pip / !pip) targets the environment of the running kernel.
%pip install torch torchvision transformers timm accelerate pillow scikit-learn pandas


Collecting torchvision
  Downloading torchvision-0.23.0-cp313-cp313-win_amd64.whl.metadata (6.1 kB)
Collecting timm
  Downloading timm-1.0.20-py3-none-any.whl.metadata (61 kB)
Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Downloading torchvision-0.23.0-cp313-cp313-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 36.0 MB/s eta 0:00:00
Downloading timm-1.0.20-py3-none-any.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---------------------------------------- 2.5/2.5 MB 49.9 MB/s eta 0:00:00
Downloading accelerate-1.10.1-py3-none-any.whl (374 kB)
Installing collected packages: torchvision, timm, accelerate

   ---------------------------------------- 0/3 [torchvision]
   ---------------------------------------- 0/3 [torchvision]
   ---------------------------------------- 0/3 [torchvision]
   ---------------------

In [2]:
from transformers import AutoImageProcessor, AutoModel
import torch
from PIL import Image
import requests
import numpy as np
import os


## Compare embedding similarity between two objects

In [5]:
# Load the DINOv2 base checkpoint: the image preprocessor and the backbone.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
# .eval() makes inference mode explicit, matching the other model-loading
# cells in this notebook.
model = AutoModel.from_pretrained("facebook/dinov2-base").eval()

def get_embedding(image_path):
    """Return an L2-normalised DINOv2 embedding for one image file.

    The image is opened with PIL, converted to RGB, passed through the
    module-level ``processor`` and ``model``, and the model's
    ``last_hidden_state`` is mean-pooled over dim 1.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        A CPU tensor whose rows have unit L2 norm (one row per processed
        image; a single row for the single image loaded here).
    """
    # The context manager releases the file handle once convert() has
    # produced an in-memory RGB copy.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean over dim 1 pools every output token of the last layer.
    # NOTE(review): for the Hugging Face DINOv2 model this presumably
    # includes the CLS token as well as the patch tokens -- confirm.
    emb = outputs.last_hidden_state.mean(dim=1)
    # Normalise along the feature axis so every row is unit length on its
    # own, instead of dividing by the norm of the whole tensor.
    emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb.cpu()

# Example images
# Forward slashes work on Windows and POSIX alike and avoid backslash escape
# sequences inside the string literal.
example_image = "data/detected_['tree']_0.663.jpg"

# Both embeddings come from the same file, so this is only a sanity check:
# the expected cosine similarity is 1.0.
# TODO: point emb2 at a second image to compare two different objects.
emb1 = get_embedding(example_image)
emb2 = get_embedding(example_image)

similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
print("Cosine similarity:", similarity.item())


Cosine similarity: 1.0


## Compare against reference species embeddings

In [11]:
from transformers import AutoImageProcessor, AutoModel
from PIL import Image, ImageFile
import torch
import os
import numpy as np

# Tell PIL to tolerate image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# DINOv2 base checkpoint: preprocessor first, then the backbone, which is
# switched to eval mode for inference.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = AutoModel.from_pretrained("facebook/dinov2-base")
model = model.eval()

def get_embedding(image_path):
    """Return an L2-normalised DINOv2 embedding for one image file.

    The image is opened with PIL, converted to RGB, passed through the
    module-level ``processor`` and ``model``, and the model's
    ``last_hidden_state`` is mean-pooled over dim 1.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        A CPU tensor whose rows have unit L2 norm (one row per processed
        image; a single row for the single image loaded here).
    """
    # The context manager releases the file handle once convert() has
    # produced an in-memory RGB copy.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean over dim 1 pools every output token of the last layer.
    # NOTE(review): for the Hugging Face DINOv2 model this presumably
    # includes the CLS token as well as the patch tokens -- confirm.
    emb = outputs.last_hidden_state.mean(dim=1)
    # Normalise along the feature axis so every row is unit length on its
    # own, instead of dividing by the norm of the whole tensor.
    emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb.cpu()

# -----------------------------------------------------------------
# 1️⃣ Build reference library (your known tree species)
# Folder structure example:
# data/
#   meranti_1.jpg, meranti_2.jpg
#   angsana_1.jpg, angsana_2.jpg
#   palm_1.jpg, palm_2.jpg
# -----------------------------------------------------------------

reference_dir = "data/"

# Group the embedding of every reference image under its species label.
embeddings_by_species = {}
# sorted() makes the species order reproducible; os.listdir order is arbitrary.
for fname in sorted(os.listdir(reference_dir)):
    if not fname.endswith((".jpg", ".png")):
        continue
    # Species label = file stem up to the first underscore, so
    # "meranti_1.jpg" and "meranti_2.jpg" both map to "meranti", while a
    # plain "angsana.jpg" still maps to "angsana".
    species = os.path.splitext(fname)[0].split("_")[0]
    emb = get_embedding(os.path.join(reference_dir, fname))
    embeddings_by_species.setdefault(species, []).append(emb)

# Average embeddings per species, then re-normalise the mean so every
# species reference has unit length again.
species_embeddings = {}
for species, embs in embeddings_by_species.items():
    mean_emb = torch.stack(embs).mean(dim=0)
    species_embeddings[species] = mean_emb / mean_emb.norm()

# -----------------------------------------------------------------
# 2️⃣ Classify a new (unknown) tree image
# -----------------------------------------------------------------
test_image = "tree.jpeg"
test_emb = get_embedding(test_image)

# Score the query image against every species reference embedding.
scores = {
    name: torch.nn.functional.cosine_similarity(test_emb, reference).item()
    for name, reference in species_embeddings.items()
}

# The highest-scoring species is the prediction (first one wins on ties).
pred_species = max(scores.items(), key=lambda pair: pair[1])[0]

print("Predicted species:", pred_species)
print("Similarity scores:", scores)


Predicted species: angsana
Similarity scores: {'angsana': 0.7849617004394531, 'jambulaut': 0.47202053666114807, 'mempari': 0.5361167788505554, 'pulai': 0.6082491278648376}


### Measuring accuracy

In [3]:
from transformers import AutoImageProcessor, AutoModel
import torch
from PIL import Image, ImageFile

# Tell PIL to tolerate image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# DINOv2 base checkpoint: preprocessor first, then the backbone, which is
# switched to eval mode for inference.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = AutoModel.from_pretrained("facebook/dinov2-base")
model = model.eval()

def get_embedding(image_path):
    """Return an L2-normalised DINOv2 embedding for one image file.

    The image is opened with PIL, converted to RGB, passed through the
    module-level ``processor`` and ``model``, and the model's
    ``last_hidden_state`` is mean-pooled over dim 1.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        A CPU tensor whose rows have unit L2 norm (one row per processed
        image; a single row for the single image loaded here).
    """
    # The context manager releases the file handle once convert() has
    # produced an in-memory RGB copy.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean over dim 1 pools every output token of the last layer.
    # NOTE(review): for the Hugging Face DINOv2 model this presumably
    # includes the CLS token as well as the patch tokens -- confirm.
    emb = outputs.last_hidden_state.mean(dim=1)
    # Normalise along the feature axis so every row is unit length on its
    # own, instead of dividing by the norm of the whole tensor.
    emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb.cpu()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [12]:
# Create reference embeddings for each species
reference_dir = "data/"

# Group the embedding of every reference image under its species label.
embeddings_by_species = {}
# sorted() makes the species order reproducible; os.listdir order is arbitrary.
for fname in sorted(os.listdir(reference_dir)):
    if not fname.endswith((".jpg", ".png")):
        continue
    # Species label = file stem up to the first underscore, the same rule
    # the evaluation cell applies to test filenames: "pulai_1.jpg" and
    # "pulai_2.jpg" both map to "pulai", and "angsana.jpg" to "angsana".
    species = os.path.splitext(fname)[0].split("_")[0]
    emb = get_embedding(os.path.join(reference_dir, fname))
    embeddings_by_species.setdefault(species, []).append(emb)

# Average embeddings per species, then re-normalise the mean so every
# species reference has unit length again.
species_embeddings = {}
for species, embs in embeddings_by_species.items():
    mean_emb = torch.stack(embs).mean(dim=0)
    species_embeddings[species] = mean_emb / mean_emb.norm()

print("Created reference embeddings for species:", list(species_embeddings.keys()))

Created reference embeddings for species: ['angsana', 'jambulaut', 'pulai']


In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
import os

# Prepare lists for evaluation
y_true, y_pred = [], []

test_dir = "test/"
# sorted() gives a reproducible evaluation order; os.listdir order is arbitrary.
for fname in sorted(os.listdir(test_dir)):
    if not fname.endswith((".jpg", ".png")):
        continue
    # Ground-truth label = file stem up to the first underscore. Dropping the
    # extension first keeps "pulai.jpg" from being labelled "pulai.jpg".
    true_species = os.path.splitext(fname)[0].split("_")[0]
    test_image = os.path.join(test_dir, fname)
    test_emb = get_embedding(test_image)

    # Compare with reference embeddings
    scores = {}
    for species, ref_emb in species_embeddings.items():
        sim = torch.nn.functional.cosine_similarity(test_emb, ref_emb)
        scores[species] = sim.item()
    pred_species = max(scores, key=scores.get)

    y_true.append(true_species)
    y_pred.append(pred_species)

# Compute accuracy
acc = accuracy_score(y_true, y_pred)
print(f"Overall accuracy: {acc*100:.2f}%")

# Confusion matrix (rows = true species, columns = predicted species).
# The same explicit label list is passed to confusion_matrix and used for the
# DataFrame axes, so the row/column names always match the matrix layout, even
# when a test label has no reference embedding or a reference species has no
# test image.
labels = sorted(set(species_embeddings) | set(y_true))
cm = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=labels),
                  index=labels,
                  columns=labels)
print("\nConfusion Matrix:")
print(cm)


Overall accuracy: 71.43%

Confusion Matrix:
           angsana  jambulaut  pulai
angsana          2          0      0
jambulaut        0          2      0
pulai            1          1      1
