In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# "/content/drive/MyDrive/..." paths used later in this notebook resolve.
# This only works when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import cv2
import numpy as np
import pandas as pd
import torch
from PIL import Image
from sklearn.preprocessing import normalize
from torchvision import transforms
from tqdm import tqdm

In [None]:
def load_depth_as_rgb(path):
    """
    Open the image at `path` and return it as a fully loaded RGB PIL image.

    - Images already in "RGB" mode are returned as an in-memory copy.
    - Single-channel high bit-depth modes ("I", "I;16*", "F") are min-max
      rescaled to 0-255 before being expanded to three channels. A direct
      `convert("RGB")` does not rescale such data, so values above 255 would
      otherwise saturate.
    - Every other mode is handed to `convert("RGB")` unchanged.

    The file handle is closed before returning.

    NOTE(review): a later cell in this notebook defines `load_depth_as_rgb`
    again; the later definition is the one in effect when the loop runs.
    NOTE(review): per-image min-max scaling discards absolute depth scale;
    confirm this is acceptable for the downstream models.
    """
    with Image.open(path) as img:
        if img.mode == "RGB":
            # copy() forces the pixel data to load so it survives the close.
            return img.copy()

        if img.mode == "F" or img.mode.startswith("I"):
            depth = np.asarray(img, dtype=np.float32)
            lo = float(depth.min())
            span = float(depth.max()) - lo
            if span > 0:
                depth = (depth - lo) * (255.0 / span)
            else:
                # Constant image: nothing to stretch, map everything to 0.
                depth = np.zeros_like(depth)
            return Image.fromarray(depth.astype(np.uint8)).convert("RGB")

        return img.convert("RGB")

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## DPT (MiDaS) Encoder Embeddings

In [None]:
import torch.hub

# Fetch DPT-Large from the MiDaS hub repo, move it to the selected device and
# put it in inference mode (Module.to() and Module.eval() both return the module).
midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large").to(device).eval()

# The same hub repo exposes the preprocessing pipelines; keep a handle on the
# DPT-specific one.
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
dpt_transform = midas_transforms.dpt_transform

Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


## Depth-MAE Encoder Embeddings

In [None]:
from transformers import ViTMAEModel

# Load the pretrained ViT-MAE encoder for feature extraction.
# mask_ratio=0.0 overrides the checkpoint's pre-training default (0.75):
# with the default, each forward pass randomly drops most patches, so the
# CLS embedding would be computed from a random subset of the image and would
# differ from run to run.
mae_model = ViTMAEModel.from_pretrained(
    "facebook/vit-mae-base",
    mask_ratio=0.0,
).to(device)
mae_model.eval()

# Preprocessing: fixed 224x224 input, scaled to [0, 1], then normalized with
# the ImageNet channel statistics that the MAE checkpoint was trained with.
mae_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

## DepthAnythingV2 Encoder Embeddings

In [None]:
from transformers import pipeline

# Depth-estimation pipeline backed by Depth-Anything-V2 (small checkpoint).
# Device index 0 selects the first CUDA GPU; -1 keeps the pipeline on the CPU.
da_pipe = pipeline(
    "depth-estimation",
    device=-1 if not torch.cuda.is_available() else 0,
    model="depth-anything/Depth-Anything-V2-Small-hf",
)

Device set to use cpu


### Extract Embeddings

In [None]:
def load_depth_as_rgb(path):
    """
    Open the image at `path` and return it as a fully loaded RGB PIL image.

    - Images already in "RGB" mode are returned as an in-memory copy.
    - Single-channel high bit-depth modes ("I", "I;16*", "F") are min-max
      rescaled to 0-255 before being expanded to three channels. A direct
      `convert("RGB")` does not rescale such data, so values above 255 would
      otherwise saturate.
    - Every other mode is handed to `convert("RGB")` unchanged.

    The file handle is closed before returning.

    NOTE(review): this function is also defined in an earlier cell of the
    notebook; this later definition shadows it. Keep a single definition.
    NOTE(review): per-image min-max scaling discards absolute depth scale;
    confirm this is acceptable for the downstream models.
    """
    with Image.open(path) as img:
        if img.mode == "RGB":
            # copy() forces the pixel data to load so it survives the close.
            return img.copy()

        if img.mode == "F" or img.mode.startswith("I"):
            depth = np.asarray(img, dtype=np.float32)
            lo = float(depth.min())
            span = float(depth.max()) - lo
            if span > 0:
                depth = (depth - lo) * (255.0 / span)
            else:
                # Constant image: nothing to stretch, map everything to 0.
                depth = np.zeros_like(depth)
            return Image.fromarray(depth.astype(np.uint8)).convert("RGB")

        return img.convert("RGB")

@torch.no_grad()
def extract_dpt_embedding(img: Image.Image):
    """
    Run DPT-Large (MiDaS) on `img` and return the mean of its predicted depth map.

    The image is preprocessed with the MiDaS `dpt_transform` loaded alongside
    the model, rather than being fed to the network as a raw [0, 1] tensor at
    its native resolution.

    Args:
        img: PIL image; converted to RGB if it is in another mode.

    Returns:
        A 0-d numpy array holding the spatial mean of the model output.

    NOTE(review): the result is a single scalar per image, not a feature
    vector. If a real encoder embedding is wanted, it has to be taken from the
    backbone's intermediate features instead of the final depth map.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")

    # dpt_transform takes an HWC RGB array and returns a batched tensor.
    input_batch = dpt_transform(np.asarray(img)).to(device)

    # Model output is indexed as [batch, H, W] by the pooling below.
    out = midas(input_batch)

    # Average over the two spatial axes -> one value per batch element.
    pooled = out.mean(dim=(1, 2))

    return pooled.squeeze(0).cpu().numpy()

@torch.no_grad()
def extract_mae_embedding(img: Image.Image):
    """
    Return the first-token (index 0) hidden state of `mae_model` for `img`.

    The image goes through `mae_transform`, gets a leading batch axis, and is
    moved to `device` before the forward pass. The result is returned as a
    1-D numpy array on the CPU.

    NOTE(review): ViT-MAE can randomly mask patches at inference depending on
    the model config's mask_ratio - confirm how `mae_model` was loaded.
    """
    pixel_values = mae_transform(img)[None].to(device)
    hidden_states = mae_model(pixel_values=pixel_values).last_hidden_state
    # Batch element 0, token 0.
    return hidden_states[0, 0].cpu().numpy()

from sklearn.preprocessing import normalize

@torch.no_grad()
def extract_depthanything_embedding(img: Image.Image, target_size=256, embedding_dim=512):
    """
    Build a fixed-length descriptor from the DepthAnythingV2 pipeline's depth map.

    Steps: predict depth with `da_pipe`, resize the map to
    `target_size` x `target_size`, z-score it, average-pool the flattened map
    into `embedding_dim` contiguous bins, then L2-normalize the result.

    Args:
        img: input PIL image.
        target_size: side length the depth map is resized to before pooling.
        embedding_dim: length of the returned vector (default 512, matching
            the previous hard-coded value).

    Returns:
        1-D numpy array of length `embedding_dim`.

    Raises:
        ValueError: if `target_size` or `embedding_dim` is not positive, or if
            the resized map has fewer pixels than `embedding_dim`.

    NOTE(review): this is a pooled depth map, not a feature taken from the
    model's encoder.
    """
    if target_size < 1 or embedding_dim < 1:
        raise ValueError("target_size and embedding_dim must be positive")
    if target_size * target_size < embedding_dim:
        raise ValueError(
            "target_size**2 must be at least embedding_dim "
            f"(got target_size={target_size}, embedding_dim={embedding_dim})"
        )

    depth_map = da_pipe(img)["depth"]  # may be PIL.Image

    # Convert to numpy float32 (accepts a PIL image or an existing array).
    depth_map = np.asarray(depth_map, dtype=np.float32)

    # Resize to a fixed size so every image yields the same number of pixels.
    depth_map = cv2.resize(depth_map, (target_size, target_size))

    # Z-score; the epsilon guards against a constant map (std == 0).
    depth_map = (depth_map - depth_map.mean()) / (depth_map.std() + 1e-8)

    flat = depth_map.reshape(-1)  # length = target_size * target_size

    if flat.size % embedding_dim == 0:
        # Equal-sized bins: same computation as the original reshape + mean.
        depth_emb = flat.reshape(embedding_dim, -1).mean(axis=1)
    else:
        # Pixel count is not a multiple of embedding_dim: use near-equal bins
        # instead of failing in reshape.
        depth_emb = np.array(
            [chunk.mean() for chunk in np.array_split(flat, embedding_dim)],
            dtype=flat.dtype,
        )

    # L2 normalize
    depth_emb = normalize(depth_emb.reshape(1, -1))[0]

    return depth_emb

In [None]:
import cv2  # used inside extract_depthanything_embedding; must be in scope before the loop

# Input table; its "depthmap" column holds the path handed to load_depth_as_rgb.
df = pd.read_csv("/content/drive/MyDrive/NNDL Project/clip_embeddings_sunrgbd_with_labels.csv")

# One result list per extractor, filled in row order.
dpt_embs, mae_embs, da_embs = [], [], []

for _, row in tqdm(df.iterrows(), total=len(df)):
    depth_img = load_depth_as_rgb(row["depthmap"])
    # Same image through each extractor, in the original order: DPT, MAE, DepthAnything.
    for collected, extractor in (
        (dpt_embs, extract_dpt_embedding),
        (mae_embs, extract_mae_embedding),
        (da_embs, extract_depthanything_embedding),
    ):
        collected.append(extractor(depth_img))

# Attach the three embedding columns to a copy of the input table and export it.
df_out = df.assign(
    dpt_embedding=dpt_embs,
    mae_embedding=mae_embs,
    depthanythingv2_embedding=da_embs,
)
df_out.to_csv("/content/drive/MyDrive/NNDL Project/sunrgbd_depth_embeddings.csv", index=False)

 16%|█▌        | 219/1385 [2:13:37<11:56:28, 36.87s/it]