## Cosmos Transfer AV Sample Inference

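Demonstration notebook for running Cosmos Transfer inference on the AV sample. It builds a CLI-style configuration, loads and validates the ControlNet specs, initializes the generation pipeline from the checkpoint directory, explicitly loads the tokenizer weights, and saves the generated video together with its final prompt.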

In [None]:

import os, sys

# Locate the repository root: when the notebook is started from the
# `examples` directory the root is its parent; otherwise the current working
# directory is taken to be the root.
if os.path.basename(os.getcwd()) == 'examples':
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
else:
    ROOT_DIR = os.getcwd()

# Make modules under the repository root importable from this notebook.
sys.path.append(ROOT_DIR)

# ---- Inference options (edit as needed) ----

# Text prompt for the generated video.
PROMPT = "The video is captured from a camera mounted on a car. The camera is facing forward."

# Filesystem locations, all resolved relative to ROOT_DIR.
CHECKPOINT_DIR = os.path.join(ROOT_DIR, 'checkpoints')
VIDEO_SAVE_NAME = 'test1'  # base name of the output files; extensions are added when saving
VIDEO_SAVE_FOLDER = os.path.join(ROOT_DIR, 'outputs', 'sample_av')
CONTROLNET_SPECS = os.path.join(ROOT_DIR, 'assets', 'sample_av_hdmap_spec.json')

# Sampling and runtime switches forwarded into the configuration namespace.
SIGMA_MAX = 80
OFFLOAD_TEXT_ENCODER_MODEL = True
IS_AV_SAMPLE = True
NUM_GPUS = 1


In [None]:

from argparse import Namespace
from cosmos_transfer1.diffusion.inference.inference_utils import load_controlnet_specs, validate_controlnet_specs
from cosmos_transfer1.diffusion.inference.preprocessors import Preprocessors

# Collect the settings in a plain dict (same names and order as the
# CLI-style arguments) and wrap them in a Namespace so downstream helpers
# can read them as attributes.
cli_args = {
    # Prompts
    'prompt': PROMPT,
    'negative_prompt': (
        "The video captures a game playing, with bad crappy graphics and cartoonish frames. "
        "It represents a recording of old outdated games. The lighting looks very fake. "
        "The textures are very raw and basic. The geometries are very primitive. "
        "The images are very pixelated and of poor CG quality. There are many subtitles "
        "in the footage. Overall, the video is unrealistic at all."
    ),
    # Input video and control settings
    'input_video_path': '',
    'num_input_frames': 1,
    'sigma_max': SIGMA_MAX,
    'blur_strength': 'medium',
    'canny_threshold': 'medium',
    'controlnet_specs': CONTROLNET_SPECS,
    'is_av_sample': IS_AV_SAMPLE,
    # Checkpoints and output locations
    'checkpoint_dir': CHECKPOINT_DIR,
    'tokenizer_dir': 'Cosmos-Tokenize1-CV8x8x8-720p',
    'video_save_name': VIDEO_SAVE_NAME,
    'video_save_folder': VIDEO_SAVE_FOLDER,
    # Batching and sampling
    'batch_input_path': None,
    'batch_size': 1,
    'num_steps': 35,
    'guidance': 5,
    'fps': 24,
    'seed': 1,
    'num_gpus': NUM_GPUS,
    # Offloading and optional features
    'offload_diffusion_transformer': False,
    'offload_text_encoder_model': OFFLOAD_TEXT_ENCODER_MODEL,
    'offload_guardrail_models': False,
    'upsample_prompt': False,
    'offload_prompt_upsampler': False,
    'use_distilled': False,
}
cfg = Namespace(**cli_args)

# Read the ControlNet spec file referenced by cfg. Any extra arguments it
# returns are copied onto cfg, overriding the values set above.
control_inputs, json_args = load_controlnet_specs(cfg)
for arg_name, arg_value in json_args.items():
    setattr(cfg, arg_name, arg_value)

# Validate the specs against the (possibly updated) configuration.
control_inputs = validate_controlnet_specs(cfg, control_inputs)

# Run the preprocessors over the control inputs; the processed inputs replace
# the validated ones.
preprocessors = Preprocessors()
control_inputs = preprocessors(
    cfg.input_video_path,
    cfg.prompt,
    control_inputs,
    cfg.video_save_folder,
)


In [None]:

import os
from cosmos_transfer1.checkpoints import BASE_7B_CHECKPOINT_AV_SAMPLE_PATH, BASE_7B_CHECKPOINT_PATH
from cosmos_transfer1.diffusion.inference.world_generation_pipeline import DiffusionControl2WorldGenerationPipeline
from cosmos_transfer1.utils.io import save_video

# Pick the base checkpoint: the AV-sample variant when cfg.is_av_sample is
# set, otherwise the regular base checkpoint.
checkpoint_name = BASE_7B_CHECKPOINT_AV_SAMPLE_PATH if cfg.is_av_sample else BASE_7B_CHECKPOINT_PATH

# Initialize pipeline (loads weights). Every option is forwarded from cfg;
# control_inputs comes from the spec-loading/preprocessing cell.
pipeline = DiffusionControl2WorldGenerationPipeline(
    checkpoint_dir=cfg.checkpoint_dir,
    checkpoint_name=checkpoint_name,
    offload_network=cfg.offload_diffusion_transformer,
    offload_text_encoder_model=cfg.offload_text_encoder_model,
    offload_guardrail_models=cfg.offload_guardrail_models,
    guidance=cfg.guidance,
    num_steps=cfg.num_steps,
    fps=cfg.fps,
    seed=cfg.seed,
    num_input_frames=cfg.num_input_frames,
    control_inputs=control_inputs,
    sigma_max=cfg.sigma_max,
    blur_strength=cfg.blur_strength,
    canny_threshold=cfg.canny_threshold,
    upsample_prompt=cfg.upsample_prompt,
    offload_prompt_upsampler=cfg.offload_prompt_upsampler,
)

# Explicitly load tokenizer weights.
# NOTE(review): this calls a private pipeline method; presumably the
# constructor does not load the tokenizer in this configuration -- confirm.
pipeline._load_tokenizer()

# Generate video. A single-item batch is submitted: one prompt, one
# (possibly absent) input video, and one set of control inputs.
os.makedirs(cfg.video_save_folder, exist_ok=True)
outputs = pipeline.generate(
    prompt=[cfg.prompt],
    video_path=[cfg.input_video_path] if cfg.input_video_path else [None],
    negative_prompt=cfg.negative_prompt,
    control_inputs=[control_inputs],
    save_folder=cfg.video_save_folder,
    batch_size=1,
)

if outputs is not None:
    videos, final_prompts = outputs
    video_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.mp4")
    prompt_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.txt")
    # Only the first (and only) item of the batch is written out.
    # NOTE(review): H/W are read from axes 1 and 2, which assumes a
    # frame-first layout such as (T, H, W, C) -- confirm against save_video.
    save_video(
        video=videos[0],
        fps=cfg.fps,
        H=videos[0].shape[1],
        W=videos[0].shape[2],
        video_save_quality=5,
        video_save_path=video_path,
    )
    # Store the prompt that was actually used next to the video, as UTF-8.
    with open(prompt_path, 'wb') as f:
        f.write(final_prompts[0].encode('utf-8'))
    # The message must be a single-line literal with an explicit "\n": a raw
    # line break inside a single-quoted f-string is a SyntaxError.
    print(f"Saved video to {video_path}\nSaved prompt to {prompt_path}")
else:
    # generate() returned None, which this notebook reports as a guardrail block.
    print('Generation was blocked by guardrails.')
