In [2]:
import torch

In [18]:
from PIL import Image
import torchvision.transforms as T
transform = T.Compose([
    T.Resize((224, 224)),  # Resize the image to the model's input size
    T.ToTensor(),  # Convert the image to a tensor
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # Normalize the image
])
image = Image.open('crane_test.jpg')
image = transform(image)
image = image.unsqueeze(0)  # Add a batch dimension
dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
with torch.no_grad():
  dino_features = dinov2_vitl14.forward_features(image)["x_norm_patchtokens"].squeeze(0)

Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


In [26]:
def dino_slam(input, hidden, output):
    model = torch.nn.Sequential(
    torch.nn.Linear(input, hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden, hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden, hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden, output)
    )
    return model

model = dino_slam(1024, 100, 6)
model.load_state_dict(torch.load('dino_slam_model.pt'))

<All keys matched successfully>

In [24]:
dino_features.shape

torch.Size([256, 1024])

In [27]:
model.eval()

Sequential(
  (0): Linear(in_features=1024, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=100, bias=True)
  (3): ReLU()
  (4): Linear(in_features=100, out_features=100, bias=True)
  (5): ReLU()
  (6): Linear(in_features=100, out_features=6, bias=True)
)

In [28]:
keypoints = model(dino_features)
keypoints

tensor([[1.6064e+02, 7.9379e+01, 2.0886e+01, 8.9783e+01, 1.7287e+01, 1.5565e-01],
        [1.1836e+02, 5.1105e+01, 1.2065e+01, 5.6376e+01, 1.1869e+01, 2.0111e-01],
        [2.4298e+02, 8.4455e+01, 1.5581e+01, 9.8944e+01, 2.3830e+01, 5.5295e-01],
        ...,
        [1.9011e+02, 7.5321e+01, 1.0050e+01, 6.9472e+01, 1.5517e+01, 3.5352e-01],
        [1.6630e+02, 1.0134e+02, 1.5627e+01, 6.4379e+01, 8.2572e+00, 9.7412e-02],
        [1.6205e+02, 6.8918e+01, 1.4144e+01, 7.2208e+01, 1.5411e+01, 2.1284e-01]],
       grad_fn=<AddmmBackward0>)

In [29]:
keypoints.shape

torch.Size([256, 6])

In [32]:
import numpy as np
keypoints_np = keypoints.detach().numpy()
print(keypoints_np)

[[1.60637878e+02 7.93786316e+01 2.08859787e+01 8.97832413e+01
  1.72869511e+01 1.55647278e-01]
 [1.18363403e+02 5.11047554e+01 1.20650835e+01 5.63755074e+01
  1.18691034e+01 2.01109886e-01]
 [2.42979462e+02 8.44547653e+01 1.55811853e+01 9.89440308e+01
  2.38297596e+01 5.52954435e-01]
 ...
 [1.90113480e+02 7.53214569e+01 1.00499611e+01 6.94719543e+01
  1.55171576e+01 3.53523016e-01]
 [1.66303589e+02 1.01342545e+02 1.56271372e+01 6.43793335e+01
  8.25717258e+00 9.74123403e-02]
 [1.62051865e+02 6.89180832e+01 1.41444578e+01 7.22084732e+01
  1.54105597e+01 2.12843180e-01]]


In [33]:
np.savetxt('keypoints.txt', keypoints_np, delimiter=',')