In [None]:
!pip install -qU diffusers transformers accelerate

# Stable Video Diffusion

**Stable Video Diffusion (SVD)** is an image-to-video generation model that can generate 2-4 second, high-resolution (576x1024) videos conditioned on an input image.

There are two variants of this model, `SVD` and `SVD-XT`. The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.

In [None]:
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

# Load the SVD-XT checkpoint using its fp16 weights.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    variant="fp16",
    torch_dtype=torch.float16,
)
# Model offloading: pipeline components are moved to the CPU once they are no
# longer needed, which lowers GPU memory usage.
pipe.enable_model_cpu_offload()

In [None]:
# Fetch the conditioning image and resize it to (1024, 576) for the pipeline.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

# Seed the random number generator so the run can be repeated.
generator = torch.manual_seed(111)

# Generate the video frames; `decode_chunk_size=8` decodes them 8 at a time.
frames = pipe(image, generator=generator, decode_chunk_size=8).frames[0]

# Write the frames to a video file at 7 frames per second.
export_to_video(frames, "generated.mp4", fps=7)

## `torch.compile`

By compiling the UNet, we can gain a 20-25% speedup at the expense of slightly increased memory usage.

In [None]:
# Reload the SVD-XT checkpoint using its fp16 weights.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    variant="fp16",
    torch_dtype=torch.float16,
)
# Model CPU offloading is deliberately left off in this section; the pipeline
# is moved to the GPU instead.
pipe.to("cuda")
# Swap the UNet for its compiled version.
pipe.unet = torch.compile(pipe.unet, fullgraph=True, mode="reduce-overhead")

In [None]:
# Fetch the conditioning image and resize it to (1024, 576) for the pipeline.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

# Seed the random number generator so the run can be repeated.
generator = torch.manual_seed(111)

# Generate the video frames with the compiled UNet, decoding 8 frames at a time.
frames = pipe(image, generator=generator, decode_chunk_size=8).frames[0]

# Write the frames to a video file at 7 frames per second.
export_to_video(frames, "generated.mp4", fps=7)

## Reduce memory usage

Video generation is very memory intensive because we are generating `num_frames` all at once, similar to text-to-image generation with a high batch size.

To reduce the memory requirement, we can:
* enable model offloading: each component of the pipeline is offloaded to the CPU once it is not needed anymore.
* enable feed-forward chunking: the feed-forward layer runs in a loop instead of running a single feed-forward with a huge batch size.
* reduce `decode_chunk_size`: the VAE decodes frames in chunks instead of decoding them all together. Setting `decode_chunk_size=1` decodes one frame at a time and uses the least amount of memory.

In [None]:
# Reload the SVD-XT checkpoint using its fp16 weights.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    variant="fp16",
    torch_dtype=torch.float16,
)
# Memory savers: offload pipeline components to the CPU once they are no longer
# needed, and run the UNet's feed-forward layers in a loop instead of as one
# huge batch.
pipe.enable_model_cpu_offload()
pipe.unet.enable_forward_chunking()

In [None]:
# Fetch the conditioning image and resize it to (1024, 576) for the pipeline.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

# Seed the random number generator so the run can be repeated.
generator = torch.manual_seed(111)

# Generate 25 frames, decoding only 2 at a time to keep VAE memory usage low.
frames = pipe(
    image,
    generator=generator,
    num_frames=25,
    decode_chunk_size=2,
).frames[0]

# Write the frames to a video file at 7 frames per second.
export_to_video(frames, "generated.mp4", fps=7)

## Micro-conditioning

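Besides the conditioning image, the pipeline accepts micro-conditioning arguments that give more control over the generated video:

* `fps`: the frames per second of the generated video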
* `motion_bucket_id`: the motion bucket id to use for the generated video. This can be used to control the motion of the generated video. Increasing the motion bucket id increases the motion of the generated video.
* `noise_aug_strength`: the amount of noise added to the conditioning image. The higher the value, the less the video resembles the conditioning image. Increasing this value also increases the motion of the generated video.

In [None]:
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

# Reload the SVD-XT checkpoint using its fp16 weights.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    variant="fp16",
    torch_dtype=torch.float16,
)
# Model offloading: pipeline components are moved to the CPU once they are no
# longer needed, which lowers GPU memory usage.
pipe.enable_model_cpu_offload()

In [None]:
# Fetch the conditioning image and resize it to (1024, 576) for the pipeline.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

# Seed the random number generator so the run can be repeated.
generator = torch.manual_seed(42)

# Generate the frames with explicit micro-conditioning: `motion_bucket_id`
# controls the amount of motion, `noise_aug_strength` the amount of noise added
# to the conditioning image.
frames = pipe(
    image,
    generator=generator,
    decode_chunk_size=8,
    noise_aug_strength=0.1,
    motion_bucket_id=180,
).frames[0]

# Write the frames to a video file at 7 frames per second.
export_to_video(frames, "generated.mp4", fps=7)