In [1]:
import torch
import torchvision.transforms as T
from PIL import Image
import os
import numpy as np

# Pick the compute device: use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cpu


In [2]:
# Fetch the DINOv2 ViT-B/14 backbone through torch.hub (downloads the repo and
# checkpoint on first use) and move it to the selected device in one step.
model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14", pretrained=True).to(device)

# Switch to inference mode. eval() returns the module itself, so as the last
# expression of the cell it also displays the architecture.
model.eval()


Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to C:\Users\LENOVO/.cache\torch\hub\main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth" to C:\Users\LENOVO/.cache\torch\hub\checkpoints\dinov2_vitb14_pretrain.pth
100%|██████████| 330M/330M [00:31<00:00, 11.0MB/s] 


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (norm): LayerNorm((768,), eps=1e-06, elementwise_affi

In [3]:
# Per-channel normalisation constants (the commonly used ImageNet RGB mean / std).
_norm_mean = (0.485, 0.456, 0.406)
_norm_std = (0.229, 0.224, 0.225)

# Preprocessing applied to every image before it is fed to the model:
# resize, take a 224x224 centre crop (224 = 16 * 14, a multiple of the model's
# 14-pixel patch size), convert to a tensor, then normalise each channel.
_preprocess_steps = [
    T.Resize(224),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=_norm_mean, std=_norm_std),
]
transform = T.Compose(_preprocess_steps)


In [7]:
# Extract one L2-normalised DINOv2 embedding per image in DATASET_DIR.
# Produces:
#   embeddings      -- np.ndarray with one row per image
#   embedding_names -- file names, aligned row-for-row with `embeddings`
DATASET_DIR = "../imc_25/image-matching-challenge-2025/train/imc2024_lizard_pond"

# os.listdir() returns entries in arbitrary order; sort so that row indices
# (and everything derived from them downstream) are reproducible across runs
# and platforms.
image_names = sorted(os.listdir(DATASET_DIR))
embeddings = []
embedding_names = []

with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for name in image_names:
        if not name.lower().endswith((".png", ".jpg", ".jpeg")):
            continue   # skip anything that is not an image file

        path = os.path.join(DATASET_DIR, name)
        # Context manager guarantees the underlying file handle is released
        # even if decoding or the transform raises.
        with Image.open(path) as img:
            img_tensor = transform(img.convert("RGB")).unsqueeze(0).to(device)

        feat = model(img_tensor)                      # [1, D]
        # Unit-length rows, so a plain dot product later equals cosine similarity.
        feat = torch.nn.functional.normalize(feat, dim=1)

        embeddings.append(feat.cpu().numpy()[0])
        embedding_names.append(name)

# np.stack() on an empty list fails with an unhelpful message; report the
# actual problem instead (same exception type as before).
if not embeddings:
    raise ValueError(f"No .png/.jpg/.jpeg images found in {DATASET_DIR!r}")

embeddings = np.stack(embeddings)

print("Embeddings shape:", embeddings.shape)



Embeddings shape: (214, 768)


In [8]:
# Pairwise similarity between all images. The embedding rows were L2-normalised
# when they were extracted, so each dot product is a cosine similarity.
similarity_matrix = np.matmul(embeddings, embeddings.T)


In [12]:
# Inspect the K most similar images for a single reference image.
K = 15  # number of neighbors to retrieve
idx = 0  # reference image index

sims = similarity_matrix[idx]
order = np.argsort(-sims)  # indices sorted by descending similarity
# Drop the reference image by index rather than assuming it sorts first:
# with duplicate images or floating-point ties it may not be at position 0,
# and slicing [1:K+1] would then keep the image itself and lose a real neighbor.
topk_idx = order[order != idx][:K]

print("Reference image:", embedding_names[idx])
print("Top-K neighbors:")
for i in topk_idx:
    print(f"  {embedding_names[i]}  similarity={sims[i]:.3f}")


Reference image: lizard_00003.png
Top-K neighbors:
  lizard_00527.png  similarity=0.722
  lizard_00698.png  similarity=0.667
  lizard_00544.png  similarity=0.628
  lizard_00538.png  similarity=0.607
  lizard_00051.png  similarity=0.601
  lizard_00531.png  similarity=0.596
  lizard_00519.png  similarity=0.590
  lizard_00233.png  similarity=0.573
  lizard_00013.png  similarity=0.572
  lizard_00107.png  similarity=0.566
  lizard_00516.png  similarity=0.551
  lizard_00226.png  similarity=0.541
  lizard_00160.png  similarity=0.541
  lizard_00512.png  similarity=0.541
  lizard_00034.png  similarity=0.537


In [16]:
import numpy as np

# Build the K-nearest-neighbor list (by embedding similarity) for every image.
# NOTE(review): the previous comment here said "start with 30 for lizard_pond",
# but the value actually used is 15 -- confirm which one is intended.
K = 15  # neighbors kept per image
neighbors = {}  # image name -> list of its K most similar image names

for i, name in enumerate(embedding_names):
    sims = similarity_matrix[i]
    order = np.argsort(-sims)  # indices sorted by descending similarity
    # Exclude the image itself by index instead of assuming it is ranked first:
    # duplicates or floating-point ties can push it to another position, in
    # which case slicing [1:K+1] would keep it and drop a real neighbor.
    topk = order[order != i][:K]
    neighbors[name] = [embedding_names[j] for j in topk]

# quick check
some = embedding_names[0]
print("Example image:", some)
print("Top neighbors:", neighbors[some][:10])


Example image: lizard_00003.png
Top neighbors: ['lizard_00527.png', 'lizard_00698.png', 'lizard_00544.png', 'lizard_00538.png', 'lizard_00051.png', 'lizard_00531.png', 'lizard_00519.png', 'lizard_00233.png', 'lizard_00013.png', 'lizard_00107.png']


In [17]:
# Turn the per-image neighbor lists into a set of unordered candidate pairs.
# Sorting the two names inside each tuple makes (a, b) and (b, a) collapse into
# one entry; self-pairs are skipped.
pair_set = {
    tuple(sorted((src, dst)))
    for src, candidates in neighbors.items()
    for dst in candidates
    if src != dst
}

# Sorted list for a stable, reproducible ordering of the pairs.
pairs = sorted(pair_set)
print("Number of candidate pairs:", len(pairs))
print("First 5 pairs:", pairs[:5])


Number of candidate pairs: 2113
First 5 pairs: [('lizard_00003.png', 'lizard_00013.png'), ('lizard_00003.png', 'lizard_00034.png'), ('lizard_00003.png', 'lizard_00051.png'), ('lizard_00003.png', 'lizard_00087.png'), ('lizard_00003.png', 'lizard_00107.png')]
