In [None]:
# Copyright (c) 2026, ETH Zurich, Manthan Patel
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import torch
import os
import sys
from pathlib import Path

# Make the DeFM project root importable. The root is taken to be two directory
# levels above the current working directory of the notebook kernel.
root_dir = Path.cwd().parent.parent.resolve()
if sys.path.count(str(root_dir)) == 0:
    # Not registered yet: append it so existing sys.path entries keep priority.
    sys.path.append(str(root_dir))

# Project-local import: it is placed after the sys.path setup above so that the
# defm package can be found in the project root.
from defm.utils import preprocess_depth_image

# IPython autoreload extension in mode 2: modules are reloaded before code is
# executed, so edits to the project sources are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

# Available DeFM Conv models
model_list = [
    "defm_efficientnet_b0",
    "defm_efficientnet_b2",
    "defm_efficientnet_b4",
    "defm_efficientnet_b6",
    "defm_resnet18",
    "defm_resnet34",
    "defm_resnet50",
    "defm_regnet_y_400mf",
    "defm_regnet_y_800mf",
    "defm_regnet_y_1_6gf",
]

# Model variant loaded by the cells below; pick any entry of model_list.
MODEL_NAME = "defm_resnet50"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [None]:
# Load the DeFM entry point called MODEL_NAME through torch.hub from the local
# project checkout (source='local' means nothing is fetched from GitHub).
model = torch.hub.load(
    # torch.hub documents repo_or_dir as a str and, for local sources, places it
    # on sys.path while importing hubconf.py. Non-string sys.path entries are
    # ignored by the import system, so pass a plain string rather than a Path.
    repo_or_dir=str(root_dir),  # Path to your DeFM root folder
    model=MODEL_NAME,
    source='local',
    pretrained=True,
)
# Inference only: switch to eval mode and move the model to the selected device.
model.eval().to(DEVICE)
# Report the parameter count in millions as a quick sanity check of the load.
print(f"Loaded model: {MODEL_NAME} with {sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters.")

In [None]:
# Forward Inference Example with Dummy Data
# Depth is a non-negative distance, so draw it uniformly between 0 and 100 meters.
# torch.randn would sample a zero-mean normal distribution instead, producing
# negative values and values beyond the intended 100 m maximum.
# A locally seeded generator keeps the example reproducible without altering the
# global RNG state.
dummy_depth = torch.rand(160, 192, 1, generator=torch.Generator().manual_seed(0)) * 100  # Dummy depth input with max depth 100 meters

# Preprocessing notes:
#   * The depth image passed in should be in meters. This is very important
#     for correct metric-depth based normalization.
#   * target_size must be a multiple of the patch size; when target_size is
#     None, no resizing is applied.
#   * For BiFPN, H and W must be multiples of 32.
normalized_depth = preprocess_depth_image(dummy_depth, target_size=(160, 192))
normalized_depth = normalized_depth.to(DEVICE)

# Plain forward pass; gradient tracking is disabled for inference.
with torch.no_grad():
    output = model(normalized_depth)

# Global (class-token-like) embedding from the backbone.
class_token = output['global_backbone']       # [B, C]
# For spatial tokens, always use the P4 (H//16, W//16) feature map from BiFPN:
# this is the level that was used during distillation to match the ViT
# spatial tokens.
spatial_tokens = output['dense_bifpn']['P4']  # [B, C, H', W']

print(f"Output Spatial Tokens: {spatial_tokens.shape}") # (B, C, H', W')
print(f"Output Class Token: {class_token.shape}") # (B, C)



In [None]:
# Alternative forward pass for when the BiFPN features are not wanted.
# In this mode the input can have any size.
with torch.no_grad():
    output = model.forward_no_bifpn(normalized_depth)

# Global embedding plus two levels of the dense feature maps.
class_token = output['global_backbone']       # [B, C]
spatial_tokens = output['dense_feats']['P4']  # [B, C, H//16, W//16]
dense_feat_map = output['dense_feats']['P5']  # [B, C, H//32, W//32]

print(f"Output Spatial Tokens: {spatial_tokens.shape}") # (B, C, H', W')
print(f"Output Dense Feature Map: {dense_feat_map.shape}") # (B, C, H', W')
print(f"Output Class Token: {class_token.shape}") # (B, C)