#### DeFM: Semantic Awareness via PCA Visualization

This notebook demonstrates DeFM's ability to extract consistent semantic representations from purely geometric depth data.

🛠️ Example Scenarios
1. Cross-Sensor Stability (Cups): Analyzes household objects captured via Active Stereo (D-435), LiDAR (L-515), and Neural Stereo (ZED X, ZED 2i).

2. Robotics in the Wild (Ladder): Depth images from a RealSense D-435 mounted on an ANYmal-D quadrupedal robot during ladder-climbing tasks.

3. Monocular Depth Estimator (Drawers): Depth images generated from RGB images using Depth Anything V2 (DAv2). Notice the consistent features on the drawer handles.

In [None]:
# Copyright (c) 2026, ETH Zurich, Manthan Patel
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import sys
import torch
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
import torchvision.transforms as tt
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale

%load_ext autoreload
%autoreload 2

# Add the project root (defm) to sys.path
root_dir = Path(os.getcwd()).parent.resolve() 
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))
    
from defm.utils import preprocess_depth_dav2, preprocess_depth_image

# Config
MODEL_NAME = "defm_vit_l14"  # Only ViT models are supported currently
PATCH_SIZE = 14          # ViT patch size
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Scenario presets: keep exactly one of the three groups below active and
# leave the other two commented out.
#
# BACKGROUND_THRESHOLD is compared in run_pca_viz against the min-max scaled
# first PCA component of every patch; patches strictly below it are kept as
# foreground.
# DEPTH_MULTIPLIER is the value the raw pixel values are divided by in
# preprocess_image.

# 1. Cup Examples
DEPTH_INPUT = "metric" # "metric" or "dav2"
H, W = 518, 518      # Evaluation resolution
BACKGROUND_THRESHOLD = 0.5
INPUT_PATH = root_dir / "example_images/cups"
DEPTH_MULTIPLIER = 100.0  # Scale saved depth image to convert to meters

# 2. Ladder Examples (Realsense D-435 Depth)
# DEPTH_INPUT = "metric" # "metric" or "dav2"
# H, W = 518, 518      # Evaluation resolution
# BACKGROUND_THRESHOLD =  1.0 # Don't remove any background
# INPUT_PATH = root_dir / "example_images/ladders"
# DEPTH_MULTIPLIER = 100.0  # Scale saved depth image to convert to meters

# 3. Drawer Examples which use Monocular Depth Estimator Depth (MDE)
# DEPTH_INPUT = "dav2" # "metric" or "dav2"
# H, W = 700, 700      # Evaluation resolution
# BACKGROUND_THRESHOLD = 1.0 # Don't remove any background
# INPUT_PATH = root_dir / "example_images/drawers"
# DEPTH_MULTIPLIER = 1.0  # Scale saved depth image to convert to meters

In [None]:
def get_model(model_name=MODEL_NAME):
    """Load a pretrained DeFM backbone via TorchHub from the local source tree.

    Args:
        model_name: Name of the TorchHub entry point to load. Defaults to the
            module-level ``MODEL_NAME``.

    Returns:
        The loaded model, moved to ``DEVICE`` and switched to eval mode.
    """
    model = torch.hub.load(
        repo_or_dir='../',  # Relative to the working directory; adjust to your root DeFM directory
        model=model_name,
        source='local',
        pretrained=True,
    )
    model.to(DEVICE)
    model.eval()
    # "\u2705" is the check-mark emoji. Writing it as an escape keeps this line
    # ASCII-only, so the message survives a wrong source-file encoding (the
    # literal previously showed up as mojibake).
    print(f"\u2705 Loaded {model_name} to {DEVICE}")
    return model

# Load the backbone once at module level. run_pca_viz reads the globals
# `model` and `device` when it extracts patch tokens.
model = get_model()
device = DEVICE  # Alias of the configured DEVICE

In [None]:
def preprocess_image(image_path, target_size=(H, W)):
    """Read a depth image from disk and turn it into a model input tensor.

    The raw pixel values are converted to float32 and divided by
    ``DEPTH_MULTIPLIER``; the result is handed to the DeFM preprocessing
    helper selected by ``DEPTH_INPUT``.

    Args:
        image_path: Location of the image file (``str`` or path-like).
        target_size: Size forwarded to the preprocessing helper. Defaults to
            the module-level ``(H, W)``.

    Returns:
        The tensor returned by the preprocessing helper, moved to ``DEVICE``.

    Raises:
        OSError: If OpenCV cannot read ``image_path`` (missing file or
            unsupported/corrupt format).
        ValueError: If ``DEPTH_INPUT`` is neither ``"metric"`` nor ``"dav2"``.
    """
    # cv2.imread reports failure by returning None instead of raising, which
    # previously surfaced as an AttributeError on `.astype`.
    raw = cv2.imread(str(image_path), cv2.IMREAD_UNCHANGED)
    if raw is None:
        raise OSError(f"Could not read depth image: {image_path}")

    img_np = raw.astype(np.float32) / DEPTH_MULTIPLIER

    if DEPTH_INPUT == "metric":
        depth_tensor = preprocess_depth_image(img_np, target_size=target_size, patch_size=PATCH_SIZE)
    elif DEPTH_INPUT == "dav2":
        depth_tensor = preprocess_depth_dav2(img_np, target_size=target_size, patch_size=PATCH_SIZE)
    else:
        raise ValueError("DEPTH_INPUT must be either 'metric' or 'dav2'")

    return depth_tensor.to(DEVICE)

In [None]:
def extract_patch_tokens(model, input_tensor, device):
    """Run ``model`` on ``input_tensor`` and return its patch tokens as NumPy.

    Args:
        model: Object exposing ``get_intermediate_layers``.
        input_tensor: Tensor to feed to the model; it is moved to ``device``.
        device: Device the forward pass runs on.

    Returns:
        The first entry of the ``get_intermediate_layers`` result (requested
        with ``n=1``, no class token, normalisation on) as a NumPy array on
        the CPU.
    """
    batch = input_tensor.to(device)
    # Gradients are never needed for feature extraction.
    with torch.no_grad():
        layer_outputs = model.get_intermediate_layers(
            batch,
            n=1,
            return_class_token=False,
            norm=True,
        )
        tokens_np = layer_outputs[0].cpu().numpy()  # [1, N, D]
    return tokens_np

def run_pca_viz(input_folder, n_components=3):
    """Show DeFM patch features of every PNG in a folder as PCA color maps.

    Steps:
      1. Every ``*.png`` directly inside ``input_folder`` is passed through
         ``preprocess_image`` and the module-level ``model`` to obtain its
         patch tokens.
      2. A 1-component PCA is fitted jointly on the patches of all images.
         Patches whose min-max scaled projection is strictly below
         ``BACKGROUND_THRESHOLD`` are treated as foreground.
      3. A second PCA with ``n_components`` components is fitted on the
         foreground patches of all images together and min-max scaled per
         component, so colors are comparable across images.
      4. Each image is drawn as a pair (PCA colors on the patch grid, input
         image), two pairs per figure row. Background patches stay zero.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.png``.
        n_components: Number of PCA components used as color channels.
            NOTE(review): the result is passed to ``plt.imshow`` as a
            (rows, cols, n_components) array, so presumably only 3 or 4 will
            render - confirm before changing the default.

    Raises:
        ValueError: If the folder contains no PNG files, if the number of
            patch tokens does not match ``(H // PATCH_SIZE) * (W // PATCH_SIZE)``,
            or if no patch falls below ``BACKGROUND_THRESHOLD``.
    """
    image_paths = sorted(str(p) for p in Path(input_folder).glob("*.png"))
    if not image_paths:
        # Without this guard np.stack([]) fails later with an opaque message.
        raise ValueError(f"No .png images found in {input_folder}")

    # First pass: extract patch tokens per image
    all_tokens = []
    raw_images = []
    for img_path in image_paths:
        input_tensor = preprocess_image(img_path)
        patch_tokens = extract_patch_tokens(model, input_tensor, device)  # [1,N,D]
        all_tokens.append(patch_tokens.squeeze(0))  # [N,D]
        # Second read with OpenCV's default flags; used only for display.
        raw_images.append(cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)[:, :, 0])

    all_patch_tokens = np.stack(all_tokens)  # [num_images, N, D]
    num_images, num_patches, feat_size = all_patch_tokens.shape

    # The tokens are later laid out on this grid; fail early and clearly if
    # the configured resolution does not match what the model produced.
    grid_h, grid_w = H // PATCH_SIZE, W // PATCH_SIZE
    if grid_h * grid_w != num_patches:
        raise ValueError(
            f"Expected {grid_h}x{grid_w}={grid_h * grid_w} patches for "
            f"H={H}, W={W}, PATCH_SIZE={PATCH_SIZE}, got {num_patches}"
        )

    # Foreground/background split: a single PCA fitted on ALL images at once
    # (not per image), so the threshold means the same thing for every image.
    fg_pca = PCA(n_components=1)
    reduced_patches = fg_pca.fit_transform(all_patch_tokens.reshape(-1, feat_size))
    norm_patches = minmax_scale(reduced_patches).reshape(num_images, num_patches)
    foreground_masks = norm_patches < BACKGROUND_THRESHOLD  # [num_images, N] bool

    # Boolean indexing walks the mask in row-major order, i.e. image by image.
    fg_patches = all_patch_tokens[foreground_masks]  # [K, D]
    if fg_patches.shape[0] == 0:
        raise ValueError(
            f"No foreground patches below BACKGROUND_THRESHOLD={BACKGROUND_THRESHOLD}"
        )

    print(f"Total foreground patches for global PCA: {fg_patches.shape}")

    # Fit global PCA to foreground patches
    object_pca = PCA(n_components=n_components)
    reduced_fg_patches = minmax_scale(object_pca.fit_transform(fg_patches))

    print("Explained variance ratio:", object_pca.explained_variance_ratio_)

    # Scatter the foreground colors back to their patch positions; assignment
    # through the same mask uses the same row-major order as the extraction.
    patch_colors = np.zeros((num_images, num_patches, n_components), dtype='float32')
    patch_colors[foreground_masks] = reduced_fg_patches
    color_grids = patch_colors.reshape(num_images, grid_h, grid_w, n_components)

    # Visualization per image
    num_cols = 4  # 2 samples per row, each with PCA + original
    num_rows = (num_images + 1) // 2

    plt.figure(figsize=(num_cols * 4, num_rows * 4))

    for i, (color_patches, raw_image) in enumerate(zip(color_grids, raw_images)):
        # Image i occupies subplot slots 2*i+1 (PCA) and 2*i+2 (original).
        # ----- PCA Visualization -----
        plt.subplot(num_rows, num_cols, 2 * i + 1)
        plt.imshow(color_patches)
        plt.axis('off')

        # ----- Original Image -----
        plt.subplot(num_rows, num_cols, 2 * i + 2)
        plt.imshow(cv2.resize(raw_image, (W, H)), cmap='turbo_r', vmin=0, vmax=255)
        plt.axis('off')

    # Adjust layout and show combined figure
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.tight_layout(pad=0)
    plt.show()

# Run the pipeline on the folder selected by INPUT_PATH in the config cell
run_pca_viz(INPUT_PATH)