# Basic 3DINO-ViT Usage Example

This notebook shows a basic example of how the pretrained 3DINO-ViT model can be used to extract features from an input image. Before running it, change all paths to match your local setup!

In [12]:
import sys
sys.path.append('../')  # adjust this to your local path
from dinov2.eval.setup import build_model_for_eval
from dinov2.configs import load_and_merge_config_3d
import torch

In [13]:
# Build the 3DINO-ViT model from a named config plus a checkpoint of pretrained weights.
# If the checkpoint path does not exist, the loader reports it and the model keeps its
# random initialization (see the recorded output below), so set the path first.
pretrained_weights = 'path_to_pretrained_weights'  # replace with your local checkpoint path
config_file = 'train/vit3d_highres'

# Resolve the config first, then hand it to the model builder together with the weights path.
cfg = load_and_merge_config_3d(config_file)
model = build_model_for_eval(cfg, pretrained_weights)

# Show the module tree as a quick sanity check of the architecture.
print(model)

[Errno 2] No such file or directory: 'path_to_pretrained_weights'
No weights found, using random initialization!
DinoVisionTransformer3d(
  (patch_embed): PatchEmbed3d(
    (proj): Conv3d(1, 1024, kernel_size=(16, 16, 16), stride=(16, 16, 16))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0): BlockChunk(
      (0-5): 6 x NestedTensorBlock(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (fc2)

In [16]:
# Minimal preprocessing: rescale the input image so that its values lie in [-1, 1].
# The input shape is (batch size, channels, *spatial dims).

# Place the example input on the same device as the model's parameters rather than
# hard-coding CUDA, so input and model can never end up on different devices.
device = next(model.parameters()).device
example_img = torch.randn(1, 1, 112, 112, 112).to(device)

# For example:
# map the 0.05th percentile to -1 and the 99.95th percentile to 1, then clip to [-1, 1].
# NOTE: a constant image gives max_val == min_val and therefore a division by zero here;
# handle that case separately for real data.
min_val = torch.quantile(example_img, 0.0005)
max_val = torch.quantile(example_img, 0.9995)
example_img = (example_img - min_val) / (max_val - min_val)
example_img = torch.clip(example_img * 2 - 1, -1, 1)

# Sanity check: after clipping, the extremes should be exactly 1 and -1.
print(example_img.max(), example_img.min())

# Feature extraction is inference only, so disable autograd for the forward pass:
# no computation graph is recorded and no activations are kept alive for a backward pass.
with torch.no_grad():
    out = model(example_img)

tensor(1., device='cuda:0') tensor(-1., device='cuda:0')


In [17]:
# The model returns one feature vector per input image; with this config it has 1024 entries,
# so a batch of one image yields shape (1, 1024).
print(out.size())

torch.Size([1, 1024])
