2 changes: 1 addition & 1 deletion docs/source/en/api/models/wan_animate_transformer_3d.md
@@ -18,7 +18,7 @@ The model can be loaded with the following code snippet.
```python
from diffusers import WanAnimateTransformer3DModel

-transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-720P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
```
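
For context, a minimal sketch (not part of this diff) of passing the loaded transformer to the pipeline via the standard diffusers component-override pattern; the repo id and dtype follow the snippet above:

```python
import torch

from diffusers import WanAnimatePipeline, WanAnimateTransformer3DModel

# Load the transformer separately (e.g. in a different dtype or from a local path) ...
transformer = WanAnimateTransformer3DModel.from_pretrained(
    "Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16
)

# ... and hand it to the pipeline; a component passed explicitly overrides the one
# that `from_pretrained` would otherwise load from the checkpoint.
pipe = WanAnimatePipeline.from_pretrained(
    "Wan-AI/Wan2.2-Animate-14B-Diffusers", transformer=transformer, torch_dtype=torch.bfloat16
)
```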

## WanAnimateTransformer3DModel
48 changes: 18 additions & 30 deletions docs/source/en/api/pipelines/wan.md
@@ -281,7 +281,7 @@ For replacement mode, you additionally need:
- **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black)

> [!NOTE]
-> The preprocessing tools are available in the original Wan-Animate repository. Integration of these preprocessing steps into Diffusers is planned for a future release.
+> Raw videos should not be passed directly for inputs such as `pose_video`; the pipeline expects these inputs to be preprocessed to extract the relevant information. Preprocessing scripts are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.
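
For the replacement-mode mask video described above, a hypothetical sketch of building the expected white/black frames, assuming per-frame boolean masks (`True` where content should be generated) and that the pipeline accepts a list of PIL frames like those returned by `load_video`:

```python
import numpy as np
from PIL import Image

def masks_to_mask_video(bool_masks):
    """Turn boolean per-frame masks into white (generate) / black (preserve) RGB frames."""
    frames = []
    for mask in bool_masks:
        frame = np.where(mask, 255, 0).astype(np.uint8)  # white = generate, black = preserve
        frames.append(Image.fromarray(frame).convert("RGB"))
    return frames
```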
The example below demonstrates how to use the Wan-Animate pipeline:

@@ -293,13 +293,10 @@ import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-model_id, vae=vae, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Load character image and preprocessed videos
@@ -330,11 +327,11 @@ output = pipe(
negative_prompt=negative_prompt,
height=height,
width=width,
-num_frames=81,
-guidance_scale=5.0,
-mode="animation", # Animation mode (default)
+segment_frame_length=77,
+guidance_scale=1.0,
+mode="animate", # Animation mode (default)
).frames[0]
export_to_video(output, "animated_character.mp4", fps=16)
export_to_video(output, "animated_character.mp4", fps=30)
```
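
Not part of this diff, but if the 14B checkpoint does not fit in GPU memory, the standard diffusers offloading helpers can be used in place of `pipe.to("cuda")`:

```python
# Keeps each component on CPU and moves it to the GPU only while it runs.
pipe.enable_model_cpu_offload()
# Lowest memory footprint, at a larger speed cost:
# pipe.enable_sequential_cpu_offload()
```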

</hfoption>
@@ -345,14 +342,10 @@ import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float16)
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Load all required inputs for replacement mode
@@ -387,11 +380,11 @@ output = pipe(
negative_prompt=negative_prompt,
height=height,
width=width,
-num_frames=81,
-guidance_scale=5.0,
-mode="replacement", # Replacement mode
+segment_frame_length=77,
+guidance_scale=1.0,
+mode="replace", # Replacement mode
).frames[0]
export_to_video(output, "character_replaced.mp4", fps=16)
export_to_video(output, "character_replaced.mp4", fps=30)
```

</hfoption>
@@ -402,14 +395,10 @@ import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float16)
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("path/to/character.jpg")
@@ -443,25 +432,24 @@ output = pipe(
negative_prompt=negative_prompt,
height=height,
width=width,
-num_frames=81,
+segment_frame_length=77,
num_inference_steps=50,
guidance_scale=5.0,
-num_frames_for_temporal_guidance=5, # Use 5 frames for temporal guidance (1 or 5 recommended)
+prev_segment_conditioning_frames=5, # Use 5 frames for temporal guidance (1 or 5 recommended)
callback_on_step_end=callback_fn,
callback_on_step_end_tensor_inputs=["latents"],
).frames[0]
export_to_video(output, "animated_advanced.mp4", fps=16)
export_to_video(output, "animated_advanced.mp4", fps=30)
```
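
The `callback_fn` used above is defined in a portion of the file that this diff leaves collapsed. Purely as an illustration (the body below is hypothetical), a callback compatible with the diffusers `callback_on_step_end` interface looks like:

```python
def callback_fn(pipeline, step_index, timestep, callback_kwargs):
    # Runs after every denoising step; `callback_kwargs` holds the tensors requested
    # via `callback_on_step_end_tensor_inputs` (here: "latents").
    latents = callback_kwargs["latents"]
    print(f"step {step_index}: latents norm = {latents.norm().item():.3f}")
    # Whatever is returned is passed back to the pipeline for the next step.
    return callback_kwargs
```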

</hfoption>
</hfoptions>

#### Key Parameters

-- **mode**: Choose between `"animation"` (default) or `"replacement"`
-- **num_frames_for_temporal_guidance**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
-- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt
-- **num_frames**: Total number of frames to generate. Should be divisible by `vae_scale_factor_temporal` (default: 4)
+- **mode**: Choose between `"animate"` (default) or `"replace"`
+- **prev_segment_conditioning_frames**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
+- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions. (Note that CFG will only target the text prompt and face conditioning.)
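
As a hedged illustration of how these parameters combine (not part of this diff; the conditioning-input names `image`, `pose_video`, and `face_video` are assumptions based on the collapsed parts of the examples above, and the values are only suggestions):

```python
# Hypothetical call: enable CFG so `negative_prompt` and finer facial-expression
# control take effect, and condition each segment on 5 frames of the previous one.
output = pipe(
    image=image,                         # character image (assumed parameter name)
    pose_video=pose_video,               # preprocessed pose video (assumed parameter name)
    face_video=face_video,               # preprocessed face video (assumed parameter name)
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    mode="animate",
    segment_frame_length=77,
    prev_segment_conditioning_frames=5,  # better temporal consistency, more memory
    guidance_scale=5.0,                  # > 1.0 re-enables CFG (default 1.0 disables it)
).frames[0]
```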


## Notes