In [None]:
git clone https://github.com/facebookresearch/sam3.git
cd sam3
pip install -e .

In [None]:
# For running example notebooks
pip install -e ".[notebooks]"

# For development
pip install -e ".[train,dev]"

In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Read the Hugging Face access token from HF_TOKEN and log in with it.
# `token` is None when the variable is not set; login() then decides how to
# proceed (see the huggingface_hub documentation).
token = os.environ.get("HF_TOKEN")
login(token=token)


# Images

In [None]:
import torch
import requests
import numpy as np
from PIL import Image
from io import BytesIO
from PIL import Image
from sam3.model_builder import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor

In [None]:
# Download the demo image and decode it with Pillow.
URL = "https://1.bp.blogspot.com/-1OiYytlrNoc/VQ6m0zn00CI/AAAAAAAAi_M/2Nc_D36ztVs/s1600/image028.jpg"

# A timeout keeps the cell from hanging forever on an unresponsive host, and
# raise_for_status() surfaces HTTP errors here instead of letting Image.open
# fail later on an error page that is not image data.
response = requests.get(URL, timeout=30)
response.raise_for_status()

image = Image.open(BytesIO(response.content))
# NOTE(review): Image.show() hands the image to an external viewer; if inline
# rendering in the notebook is wanted, end the cell with `image` instead.
image.show()

In [None]:
# Build the SAM3 image model and wrap it in its processor.
model = build_sam3_image_model()
processor = Sam3Processor(model)

# Register the downloaded image, then query it with a text prompt.
# "<YOUR_TEXT_PROMPT>" is a placeholder: replace it with the concept to segment.
inference_state = processor.set_image(image)
output = processor.set_text_prompt(
    state=inference_state,
    prompt="<YOUR_TEXT_PROMPT>",
)

# Pull the three result entries out of the returned mapping.
masks, boxes, scores = (output[key] for key in ("masks", "boxes", "scores"))

# Video

In [None]:
from transformers import Sam3VideoModel, Sam3VideoProcessor
from accelerate import Accelerator
import torch

# Use whatever device Accelerate selects for this machine.
device = Accelerator().device

# Load the "facebook/sam3" checkpoint, then move it to the device in bfloat16.
# Note: this rebinds `model` and `processor` from the image section above.
model = Sam3VideoModel.from_pretrained("facebook/sam3")
model = model.to(device, dtype=torch.bfloat16)
processor = Sam3VideoProcessor.from_pretrained("facebook/sam3")

# Fetch the demo clip and load it as frames; the second value returned by
# load_video is not used here.
from transformers.video_utils import load_video

video_url = "https://huggingface.co/datasets/hf-internal-testing/sam2-fixtures/resolve/main/bedroom.mp4"
video_frames, _ = load_video(video_url)

# Create the inference session for the loaded frames: inference runs on
# `device`, while processing and video storage are both set to "cpu".
inference_session = processor.init_video_session(
    video=video_frames, inference_device=device,
    processing_device="cpu", video_storage_device="cpu",
    dtype=torch.bfloat16,
)

# Attach the text prompt naming what to detect and track.
text = "person"
inference_session = processor.add_text_prompt(inference_session=inference_session, text=text)

# Propagate the prompt through the video and collect post-processed results
# keyed by frame index.
# The tracker is given an explicit bound (max_frame_num_to_track), so this
# does not necessarily cover every frame of the clip; adjust the bound below
# if more of the video should be processed.
MAX_FRAMES_TO_TRACK = 50

outputs_per_frame = {}
for model_outputs in model.propagate_in_video_iterator(
    inference_session=inference_session,
    max_frame_num_to_track=MAX_FRAMES_TO_TRACK,
):
    processed_outputs = processor.postprocess_outputs(inference_session, model_outputs)
    outputs_per_frame[model_outputs.frame_idx] = processed_outputs

print(f"Processed {len(outputs_per_frame)} frames")

# Summarise what was found on frame index 0.
frame_0_outputs = outputs_per_frame[0]

object_ids = frame_0_outputs["object_ids"]
print(f"Detected {len(object_ids)} objects")
print(f"Object IDs: {object_ids.tolist()}")
print("Scores: {}".format(frame_0_outputs["scores"].tolist()))
print("Boxes shape (XYXY format, absolute coordinates): {}".format(frame_0_outputs["boxes"].shape))
print("Masks shape: {}".format(frame_0_outputs["masks"].shape))
