# Video Depth Estimation with Depth Anything V3
Sample every Nth frame from a video, run depth estimation, and save both RGB and depth images.

In [1]:
import sys
import os

# Repo root: resolved as two directory levels above the current working
# directory (the notebook is expected to be run from analysis/tutorials/).
ROOT = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Depth-Anything-3 sources live under the repo root; put them first on the
# import path so `depth_anything_3` can be imported below.
DA3_SRC = os.path.join(ROOT, 'Depth-Anything-3', 'src')
sys.path.insert(0, DA3_SRC)

import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from depth_anything_3.api import DepthAnything3

[93m[WARN ] Dependency `gsplat` is required for rendering 3DGS. Install via: pip install git+https://github.com/nerfstudio-project/gsplat.git@0b4dddf04cb687367602c01196913cde6a743d70[0m


In [2]:
def load_model(model_name="depth-anything/DA3-SMALL"):
    """Fetch a pretrained Depth Anything V3 model and place it on a device.

    Args:
        model_name: Identifier passed to ``DepthAnything3.from_pretrained``.

    Returns:
        The loaded model, moved to CUDA when ``torch.cuda.is_available()``
        reports a GPU and to the CPU otherwise.
    """
    if torch.cuda.is_available():
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")

    print(f"Loading model: {model_name} on {target} ...")
    return DepthAnything3.from_pretrained(model_name).to(device=target)

In [3]:
def extract_frames(video_path, sample_rate=30):
    """Extract every Nth frame from a video.

    Args:
        video_path: Location of the video file (``str`` or path-like; it is
            converted with ``str()`` before being handed to OpenCV).
        sample_rate: Keep the frames whose zero-based index is a multiple of
            this value. Must be >= 1.

    Returns:
        A list of ``(frame_index, frame)`` tuples in video order, where
        ``frame`` is the array OpenCV decoded for that index (BGR channel
        order, OpenCV's default for decoded video frames).

    Raises:
        ValueError: If ``sample_rate`` is smaller than 1.
        FileNotFoundError: If OpenCV cannot open ``video_path``.
    """
    # Validate before touching the file: a rate of 0 would otherwise surface
    # as a ZeroDivisionError from the modulo inside the read loop.
    if sample_rate < 1:
        raise ValueError(f"sample_rate must be >= 1, got {sample_rate}")

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise FileNotFoundError(f"Cannot open video: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        print(f"Video: {total_frames} frames, {fps:.1f} fps, sampling every {sample_rate} frames")

        frames = []
        idx = 0
        # read() is grab() + retrieve(); grabbing alone advances the stream
        # without decoding, so only the sampled frames pay the decode cost.
        while cap.grab():
            if idx % sample_rate == 0:
                ok, frame = cap.retrieve()
                if not ok:
                    break
                frames.append((idx, frame))
            idx += 1
    finally:
        # Always free the capture handle, including on the error paths above.
        cap.release()

    print(f"Extracted {len(frames)} frames")
    return frames

In [4]:
def process_video(model, video_path, output_dir, sample_rate=30, cmap="inferno"):
    """
    Sample frames from a video, estimate depth for each, and write image pairs.

    For a video whose file stem is <video_name>, two sibling folders are
    created under output_dir (if they do not already exist):
        <output_dir>/<video_name>-rgb/frame_000001.png, ...
        <output_dir>/<video_name>-d/frame_000001.png, ...
    Files are numbered consecutively from 1 in sampling order, so the number
    in the filename is not the original video frame index.

    Args:
        model: Object exposing ``inference(list_of_rgb_arrays)`` whose result
            has a ``depth`` sequence (as returned by ``load_model``).
        video_path: Video file to read.
        output_dir: Directory under which the two output folders are created.
        sample_rate: Forwarded to ``extract_frames``.
        cmap: Matplotlib colormap name used when writing the depth image.
    """
    stem = Path(video_path).stem
    rgb_dir = os.path.join(output_dir, f"{stem}-rgb")
    depth_dir = os.path.join(output_dir, f"{stem}-d")
    for folder in (rgb_dir, depth_dir):
        os.makedirs(folder, exist_ok=True)

    frames = extract_frames(video_path, sample_rate)
    total = len(frames)

    for seq, (frame_idx, bgr_frame) in enumerate(frames, start=1):
        fname = f"frame_{seq:06d}.png"

        # OpenCV decodes to BGR; both the saved image and the model input
        # use RGB channel order.
        rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
        plt.imsave(os.path.join(rgb_dir, fname), rgb_frame)

        # Single-image batch; take the first (only) depth map of the result.
        depth = model.inference([rgb_frame]).depth[0]

        # No vmin/vmax is passed, so each depth image is colour-scaled
        # on its own value range.
        plt.imsave(os.path.join(depth_dir, fname), depth, cmap=cmap)

        print(f"  [{seq}/{total}] video frame {frame_idx} -> {fname}")

    print(f"\nDone! RGB: {rgb_dir}  |  Depth: {depth_dir}")

# Run Video Depth Estimation
Set the input video, output directory, sample rate, and model name in the cell below. Paths are given relative to the repo root.

In [5]:
# ---- Hyperparameters ------------------------------------
# Paths are written relative to the repo root and resolved against ROOT.
model_name       = "depth-anything/DA3-SMALL"
sample_rate      = 30     # sample every Nth frame
input_video_path = os.path.join(ROOT, "analysis/data/depth/surgery_video.mp4")
output_dir       = os.path.join(ROOT, "analysis/outputs/depth/")
# --------------------------------------------------------

# Load the model once, then run the full extract -> depth -> save pipeline.
model = load_model(model_name)
process_video(model, input_video_path, output_dir, sample_rate=sample_rate)

Loading model: depth-anything/DA3-SMALL on cuda ...
[97m[INFO ] using MLP layer as FFN[0m
Video: 1638 frames, 24.0 fps, sampling every 30 frames
Extracted 55 frames
[97m[INFO ] Processed Images Done taking 0.026705503463745117 seconds. Shape:  torch.Size([1, 3, 280, 504])[0m
[97m[INFO ] Model Forward Pass Done. Time: 0.46265339851379395 seconds[0m
[97m[INFO ] Conversion to Prediction Done. Time: 0.0011136531829833984 seconds[0m
  [1/55] video frame 0 -> frame_000001.png
[97m[INFO ] Processed Images Done taking 0.026996850967407227 seconds. Shape:  torch.Size([1, 3, 280, 504])[0m
[97m[INFO ] Model Forward Pass Done. Time: 0.11876988410949707 seconds[0m
[97m[INFO ] Conversion to Prediction Done. Time: 0.0004761219024658203 seconds[0m
  [2/55] video frame 30 -> frame_000002.png
[97m[INFO ] Processed Images Done taking 0.03297281265258789 seconds. Shape:  torch.Size([1, 3, 280, 504])[0m
[97m[INFO ] Model Forward Pass Done. Time: 0.0325932502746582 seconds[0m
[97m[INFO ] 