In [7]:
import torch
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'

# Fetch the sample image. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() surfaces an HTTP error here rather than as
# an opaque PIL decoding failure on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
model = AutoModel.from_pretrained('facebook/dinov2-base')

inputs = processor(images=image, return_tensors="pt")

# Request the per-layer hidden states explicitly: later cells index
# `outputs.hidden_states`, which the model does not return by default.
# Gradient tracking is intentionally left enabled so that tensor reprs keep
# their `grad_fn`, which shows the op that produced each tensor.
outputs = model(**inputs, output_hidden_states=True)

# --- Optional TorchScript tracing sanity check (currently disabled) ---------
# last_hidden_states = outputs[0]

# We have to force return_dict=False for tracing
# model.config.return_dict = False

# with torch.no_grad():
#     traced_model = torch.jit.trace(model, [inputs.pixel_values])
#     traced_outputs = traced_model(inputs.pixel_values)

# print((last_hidden_states - traced_outputs[0]).abs().max())

In [10]:
# Inspect the shape of the preprocessed pixel tensor that is fed to the model.
inputs.pixel_values.shape

torch.Size([1, 3, 224, 224])

In [12]:
# Check that positional indexing (outputs[0]) yields the same tensor as the
# named `last_hidden_state` field.
torch.equal(outputs.last_hidden_state, outputs[0])

True

In [17]:
# Keep a handle on the final hidden state for the equality probes that follow.
tmp_lhs = outputs.last_hidden_state

In [21]:
# Probe: is the final hidden state exactly the third-from-last entry of
# `hidden_states`?
# NOTE(review): this needs `outputs.hidden_states` to be populated — confirm
# the forward pass that produced `outputs` requested hidden states.
torch.equal(tmp_lhs, outputs.hidden_states[-3])

False

In [46]:
# Probe: is the final hidden state exactly the ninth-from-last entry of
# `hidden_states`? The index is an ad-hoc probe, not a meaningful layer choice.
torch.equal(outputs.hidden_states[-9], outputs.last_hidden_state)

False

In [41]:
# Display the final hidden state; the `grad_fn` in its repr shows which op
# produced it.
outputs.last_hidden_state

tensor([[[-2.1747, -0.4729,  1.0936,  ...,  0.2041,  1.1101,  0.1363],
         [-3.2780, -0.8269, -0.9210,  ...,  1.4415, -0.5364, -0.8757],
         [-2.9129,  1.1284, -0.7306,  ...,  0.6959, -1.8791, -2.3638],
         ...,
         [-0.5463,  1.4382, -0.2564,  ...,  0.1874, -2.9950,  0.4068],
         [-3.0848,  2.0568,  1.5137,  ...,  0.9157, -2.7059,  2.2017],
         [-0.7499,  0.0902,  1.3731,  ..., -0.2961, -2.3682, -0.1329]]],
       grad_fn=<NativeLayerNormBackward0>)

In [40]:
# Display the last entry of `hidden_states` for comparison with
# `last_hidden_state`.
# NOTE(review): the recorded reprs show different grad_fns (an Add here, a
# LayerNorm for last_hidden_state), which suggests last_hidden_state is this
# tensor after a final LayerNorm — that would explain why the equality probes
# return False. Verify against the model implementation.
outputs.hidden_states[-1]

tensor([[[-1.4676, -0.2195,  0.3422,  ...,  0.6307,  0.4981, -0.4416],
         [-1.7904,  0.1015, -0.0225,  ...,  2.2653, -0.1758, -0.6733],
         [-3.2630,  1.2945, -0.8190,  ...,  1.6167, -2.8213, -3.6856],
         ...,
         [-0.4991,  1.5252, -0.2410,  ...,  1.0018, -3.3789, -0.2315],
         [-3.0342,  2.1589,  0.8684,  ...,  1.8011, -3.3787,  1.5811],
         [-0.8713,  0.3466,  0.9158,  ...,  0.5766, -3.3744, -0.9730]]],
       grad_fn=<AddBackward0>)

In [42]:
# Print the module tree to inspect the model's layer structure.
model

Dinov2Model(
  (embeddings): Dinov2Embeddings(
    (patch_embeddings): Dinov2PatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): Dinov2Encoder(
    (layer): ModuleList(
      (0-11): 12 x Dinov2Layer(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attention): Dinov2Attention(
          (attention): Dinov2SelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): Dinov2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (layer_scale1): Dinov2LayerScale()
        (drop_path): Ide

In [2]:
# List which fields the model output actually carries.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [4]:
# Shape of the final hidden state.
# NOTE(review): the middle dimension is presumably 16x16 patches (224 / 14 per
# side, per the patch-embedding Conv2d) plus one extra token — confirm.
outputs.last_hidden_state.shape

torch.Size([1, 257, 768])

In [5]:
# Check which concrete class AutoModel resolved to for this checkpoint.
type(model)

transformers.models.dinov2.modeling_dinov2.Dinov2Model

In [6]:
# Read the embedding width from the model config, for comparison with the
# last dimension of the hidden-state tensors.
model.config.hidden_size

768

In [9]:
# Repeat of the final-hidden-state shape check (same expression as an earlier
# cell); candidate for removal when the notebook is tidied.
outputs.last_hidden_state.shape

torch.Size([1, 257, 768])