In [None]:
!pip install -qU diffusers accelerate transformers huggingface_hub

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably only required for gated/private downloads — confirm.
notebook_login()

# Text or image-to-video

## Popular models

### CogVideoX

CogVideoX uses a 3D Variational Autoencoder (VAE) to compress videos along the spatial and temporal dimensions.

In [None]:
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Load the CogVideoX 5B image-to-video checkpoint. The 5B checkpoint is documented for
# bfloat16 inference, so load it in bfloat16 rather than float16.
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    'THUDM/CogVideoX-5b-I2V',
    torch_dtype=torch.bfloat16,
)

# Execute on the GPU via diffusers' model CPU offloading. Without any device placement
# the pipeline stays on the CPU, and the CUDA generator passed at inference time is
# rejected when the initial latents are sampled.
pipe.enable_model_cpu_offload()

# Reduce the memory needed by the VAE.
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()

In [None]:
prompt = "A vast, shimmering ocean flows gracefully under a twilight sky, its waves undulating in a mesmerizing dance of blues and greens. The surface glints with the last rays of the setting sun, casting golden highlights that ripple across the water. Seagulls soar above, their cries blending with the gentle roar of the waves. The horizon stretches infinitely, where the ocean meets the sky in a seamless blend of hues. Close-ups reveal the intricate patterns of the waves, capturing the fluidity and dynamic beauty of the sea in motion."
# NOTE: this is a relative path, so `cogvideox_rocket.png` must exist in the working directory.
image = load_image(image="cogvideox_rocket.png")

# `image` is the first positional parameter of this pipeline's `__call__`, so the prompt
# has to be passed by keyword. Passing it positionally together with `image=` raises
# "TypeError: got multiple values for argument 'image'".
video = pipe(
    prompt=prompt,
    image=image,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    generator=torch.Generator('cuda').manual_seed(111),
).frames[0]

# Write the frames of the first generated video to disk at 8 fps.
export_to_video(video, 'output.mp4', fps=8)

### Stable Video Diffusion

SVD is based on the Stable Diffusion 2.1 model and is trained on images, then low-resolution videos, and finally a smaller dataset of high-resolution videos. This model generates a short 2-4 second video from an initial image.

In [None]:
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

# Load the fp16 weight variant of the SVD-XT image-to-video checkpoint in half precision.
pipeline = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", variant="fp16", torch_dtype=torch.float16
)

# Turn on diffusers' model CPU offloading for this pipeline.
pipeline.enable_model_cpu_offload()

In [None]:
# Fetch the conditioning image from the Hub and resize it to 1024x576.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

# Seed torch's default generator and hand it to the pipeline.
generator = torch.manual_seed(111)

# Run the pipeline, take the first entry of `.frames`, and export it at 7 fps.
frames = pipeline(image, generator=generator, decode_chunk_size=8).frames[0]
export_to_video(frames, "generated.mp4", fps=7)

### I2VGen-XL

I2VGen-XL is a diffusion model that can generate higher resolution videos than SVD and it is also capable of accepting text prompts in addition to images. The model is trained with two hierarchical encoders (detail encoder and global encoder) to better capture low and high-level details in images. These learned details are used to train a video diffusion model which refines the video resolution and details in the generated video.

In [None]:
import torch
from diffusers import I2VGenXLPipeline
from diffusers.utils import export_to_gif, load_image

# Load the fp16 weight variant of I2VGen-XL in half precision.
pipeline = I2VGenXLPipeline.from_pretrained(
    "ali-vilab/i2vgen-xl", variant="fp16", torch_dtype=torch.float16
)

# Turn on diffusers' model CPU offloading for this pipeline.
pipeline.enable_model_cpu_offload()

In [None]:
# Conditioning image, converted to RGB after download.
image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
image = load_image(image_url).convert("RGB")

# Text conditioning: what we want, and what we want to steer away from.
prompt = "Papers were floating in the air on a table in the library"
negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"

# Seed torch's default generator and hand it to the pipeline.
generator = torch.manual_seed(111)

frames = pipeline(
    prompt=prompt,
    image=image,
    negative_prompt=negative_prompt,
    guidance_scale=9.0,
    num_inference_steps=50,
    generator=generator,
).frames[0]

# Save the first entry of `.frames` as an animated GIF.
export_to_gif(frames, "i2v.gif")

### AnimateDiff

AnimateDiff is an adapter model that inserts a motion module into a pretrained diffusion model to animate an image. The adapter is trained on video clips to learn motion which is used to condition the generation process to create a video. It is faster and easier to only train the adapter and it can be loaded into most diffusion models, effectively turning them into "video models".

In [None]:
# Step 1: load the `MotionAdapter` checkpoint in half precision.
import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif

adapter = MotionAdapter.from_pretrained(
    "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
)

In [None]:
# Load a finetuned SD checkpoint into `AnimateDiffPipeline`, attaching the motion adapter.
pipeline = AnimateDiffPipeline.from_pretrained(
    'emilianJR/epiCRealism',
    motion_adapter=adapter,
    torch_dtype=torch.float16,
)

# Build a DDIM scheduler from the checkpoint's `scheduler` subfolder, overriding a few
# of its settings, and install it on the pipeline.
scheduler = DDIMScheduler.from_pretrained(
    'emilianJR/epiCRealism',
    subfolder='scheduler',
    clip_sample=False,
    timestep_spacing='linspace',
    beta_schedule='linear',
    steps_offset=1,
)

pipeline.scheduler = scheduler
# Enable slicing directly on the VAE, the same way the CogVideoX cell does, instead of
# going through the pipeline-level `enable_vae_slicing()` helper.
pipeline.vae.enable_slicing()
pipeline.enable_model_cpu_offload()

In [None]:
# Generate a 16-frame clip from the text prompt with a fixed CPU seed.
output = pipeline(
    prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
    negative_prompt="bad quality, worse quality, low resolution",
    generator=torch.Generator("cpu").manual_seed(111),
    num_inference_steps=50,
    guidance_scale=7.5,
    num_frames=16,
)

# Save the first entry of `.frames` as an animated GIF.
frames = output.frames[0]
export_to_gif(frames, "animation.gif")

### ModelscopeT2V

ModelscopeT2V adds spatial and temporal convolutions and attention to a UNet, and it is trained on image-text and video-text datasets to enhance what it learns during training. The model takes a prompt, encodes it and creates text embeddings which are denoised by the UNet, and then decoded by a VQGAN into a video.

In [None]:
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

# Load the fp16 weight variant of the ModelScope text-to-video checkpoint in half precision.
pipeline = DiffusionPipeline.from_pretrained(
    'damo-vilab/text-to-video-ms-1.7b',
    torch_dtype=torch.float16,
    variant='fp16',
)
pipeline.enable_model_cpu_offload()
# Enable slicing directly on the VAE, the same way the CogVideoX cell does, instead of
# going through the pipeline-level `enable_vae_slicing()` helper.
pipeline.vae.enable_slicing()

In [None]:
# Generate from the text prompt alone and export the first entry of `.frames` at 10 fps.
prompt = "Confident teddy bear surfer rides the wave in the tropics"
video_frames = pipeline(prompt).frames[0]
export_to_video(video_frames, "modelscopet2v.mp4", fps=10)

## Configure model parameters

### Number of frames

`num_frames` determines the total number of video frames that are generated. A frame is an image that is played in a sequence of other frames to create motion or a video. Because the frames are played back at a fixed number of frames per second (the `fps` used when exporting the video), the video length is `num_frames / fps`. To increase the video duration, we need to increase the `num_frames` parameter.

In [None]:
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, load_image

# Load the fp16 weight variant of the base SVD image-to-video checkpoint in half precision.
pipeline = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid", variant="fp16", torch_dtype=torch.float16
)

# Turn on diffusers' model CPU offloading for this pipeline.
pipeline.enable_model_cpu_offload()

In [None]:
# Fetch the conditioning image from the Hub and resize it to 1024x576.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

# Seed torch's default generator and hand it to the pipeline.
generator = torch.manual_seed(111)

# Request 25 frames explicitly, then export the first entry of `.frames` at 7 fps.
frames = pipeline(image, num_frames=25, generator=generator, decode_chunk_size=8).frames[0]
export_to_video(frames, "generated.mp4", fps=7)

### Guidance scale

`guidance_scale` controls how closely the generated video aligns with the text prompt or initial image. A higher `guidance_scale` value means our generated video is more aligned with the text prompt or initial image, while a lower `guidance_scale` value means our generated video is less aligned, which could give the model more "creativity" to interpret the conditioning input.

In [None]:
import torch
from diffusers import I2VGenXLPipeline
from diffusers.utils import export_to_gif, load_image

# Load the fp16 weight variant of I2VGen-XL in half precision.
pipeline = I2VGenXLPipeline.from_pretrained(
    "ali-vilab/i2vgen-xl", variant="fp16", torch_dtype=torch.float16
)

# Turn on diffusers' model CPU offloading for this pipeline.
pipeline.enable_model_cpu_offload()

In [None]:
# Conditioning image, converted to RGB after download.
image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
image = load_image(image_url).convert("RGB")

# Text conditioning: what we want, and what we want to steer away from.
prompt = "Papers were floating in the air on a table in the library"
negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"

# Seed torch's default generator and hand it to the pipeline.
generator = torch.manual_seed(0)

# Same call shape as the earlier I2VGen-XL example, but with `guidance_scale` set to 1.0.
frames = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    image=image,
    guidance_scale=1.0,
    num_inference_steps=50,
    generator=generator,
).frames[0]

# Save the first entry of `.frames` as an animated GIF.
export_to_gif(frames, "i2v.gif")

### Negative prompt

A negative prompt deters the model from generating things we do not want it to.

In [None]:
import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif

# Load the motion adapter in half precision.
adapter = MotionAdapter.from_pretrained(
    "guoyww/animatediff-motion-adapter-v1-5-2",
    torch_dtype=torch.float16
)

# Load the base checkpoint into `AnimateDiffPipeline`, attaching the motion adapter.
pipeline = AnimateDiffPipeline.from_pretrained(
    "emilianJR/epiCRealism",
    motion_adapter=adapter,
    torch_dtype=torch.float16
)

# Build a DDIM scheduler from the checkpoint's `scheduler` subfolder, overriding a few
# of its settings, and install it on the pipeline.
scheduler = DDIMScheduler.from_pretrained(
    "emilianJR/epiCRealism",
    subfolder="scheduler",
    clip_sample=False,
    timestep_spacing="linspace",
    beta_schedule="linear",
    steps_offset=1,
)
pipeline.scheduler = scheduler
# Enable slicing directly on the VAE, the same way the CogVideoX cell does, instead of
# going through the pipeline-level `enable_vae_slicing()` helper.
pipeline.vae.enable_slicing()
pipeline.enable_model_cpu_offload()

In [None]:
# Generate a 16-frame clip; the negative prompt lists qualities to steer away from.
output = pipeline(
    prompt="360 camera shot of a sushi roll in a restaurant",
    negative_prompt="Distorted, discontinuous, ugly, blurry, low resolution, motionless, static",
    generator=torch.Generator("cpu").manual_seed(0),
    num_inference_steps=50,
    guidance_scale=7.5,
    num_frames=16,
)

# Save the first entry of `.frames` as an animated GIF.
frames = output.frames[0]
export_to_gif(frames, "animation.gif")

## Control video generation

### Text2Video-Zero

Text2Video-Zero video generation can be conditioned on pose and edge images for even greater control over a subject's motion in the generated video or to preserve the identity of a subject/object in the video.

##### Pose control

In [None]:
from huggingface_hub import hf_hub_download
from PIL import Image
import imageio

# Download the pose-skeleton clip from the PAIR/Text2Video-Zero Space.
repo_id = "PAIR/Text2Video-Zero"
filename = "__assets__/poses_skeleton_gifs/dance1_corr.mp4"
video_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="space")

# Read the first `frame_count` frames with imageio's ffmpeg reader and wrap each as a PIL image.
frame_count = 8
reader = imageio.get_reader(video_path, "ffmpeg")
pose_images = [
    Image.fromarray(reader.get_data(frame_idx))
    for frame_idx in range(frame_count)
]

Load a ControlNetModel for pose estimation and a checkpoint into the `StableDiffusionControlNetPipeline`. Then we will use the `CrossFrameAttnProcessor` for the UNet and ControlNet.

In [None]:
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor

model_id = 'stable-diffusion-v1-5/stable-diffusion-v1-5'
# The ControlNet checkpoints are published under the `lllyasviel` namespace (three "l"s);
# a misspelled repo id cannot be resolved on the Hub.
controlnet = ControlNetModel.from_pretrained(
    'lllyasviel/sd-controlnet-openpose',
    torch_dtype=torch.float16
)
# Combine the base Stable Diffusion checkpoint with the pose ControlNet and move it to the GPU.
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
    model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16
).to('cuda')

# Use the cross-frame attention processor in both the UNet and the ControlNet.
pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))

Fix the latents for all frames, and then pass our prompt and extracted pose images to the model to generate a video.

In [None]:
# Sample a single (1, 4, 64, 64) latent and repeat it once per pose image so that every
# frame starts from the same noise.
latents = torch.randn((1, 4, 64, 64), device='cuda', dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)

# "desert" (the place), not "dessert" (the food) — the typo changes what the model is asked to draw.
prompt = 'Darth Vader dancing in a desert'
# One prompt per pose image; the pose images are the ControlNet conditioning.
result = pipeline(
    prompt=[prompt]*len(pose_images),
    image=pose_images,
    latents=latents
).images
# Write the generated frames out as a video at 4 fps.
imageio.mimsave('video.mp4', result, fps=4)

##### Edge control

In [None]:
from huggingface_hub import hf_hub_download
from PIL import Image
import imageio

# Download the conditioning clip from the PAIR/Text2Video-Zero Space.
# NOTE(review): this is the same pose-skeleton clip used in the pose-control example —
# confirm it is the intended input for the canny ControlNet.
repo_id = "PAIR/Text2Video-Zero"
filename = "__assets__/poses_skeleton_gifs/dance1_corr.mp4"
video_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="space")

# Read the first `frame_count` frames with imageio's ffmpeg reader and wrap each as a PIL image.
frame_count = 8
reader = imageio.get_reader(video_path, "ffmpeg")
pose_images = [
    Image.fromarray(reader.get_data(frame_idx))
    for frame_idx in range(frame_count)
]

In [None]:
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor

# Base Stable Diffusion checkpoint plus the canny ControlNet, both in half precision.
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)

pipeline = StableDiffusionControlNetPipeline.from_pretrained(
    model_id, controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# Use the cross-frame attention processor in both the UNet and the ControlNet.
pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))

In [None]:
# Sample a single (1, 4, 64, 64) latent and repeat it once per conditioning image so
# that every frame starts from the same noise.
latents = torch.randn(
    (1, 4, 64, 64), device="cuda", dtype=torch.float16
).repeat(len(pose_images), 1, 1, 1)

prompt = "Darth Vader dancing in a desert"
# One prompt per conditioning image.
result = pipeline(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images

# Write the generated frames out as a video at 4 fps.
imageio.mimsave("video.mp4", result, fps=4)

##### InstructPix2Pix

InstructPix2Pix allows us to use text to describe the changes we want to make to the video.

In [None]:
from huggingface_hub import hf_hub_download
from PIL import Image
import imageio

# Download the source clip to edit from the PAIR/Text2Video-Zero Space.
repo_id = "PAIR/Text2Video-Zero"
filename = "__assets__/pix2pix video/camel.mp4"
video_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="space")

# Read the first `frame_count` frames with imageio's ffmpeg reader and wrap each as a PIL image.
frame_count = 8
reader = imageio.get_reader(video_path, "ffmpeg")
video = [
    Image.fromarray(reader.get_data(frame_idx))
    for frame_idx in range(frame_count)
]

In [None]:
import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor

# Load InstructPix2Pix in half precision and move it to the GPU.
pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

# Use the cross-frame attention processor in the UNet (batch_size=3 here, unlike the
# ControlNet examples which use 2).
pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=3))

In [None]:
# Apply the same edit instruction to every frame, then write the edited frames out at 4 fps.
prompt = "make it Van Gogh Starry Night style"
result = pipeline(
    prompt=[prompt] * len(video),
    image=video,
).images
imageio.mimsave("edited_video.mp4", result, fps=4)