diff --git a/docs/source/en/api/models/wan_animate_transformer_3d.md b/docs/source/en/api/models/wan_animate_transformer_3d.md
index 798afc72fb8e..cc7b3f0c408c 100644
--- a/docs/source/en/api/models/wan_animate_transformer_3d.md
+++ b/docs/source/en/api/models/wan_animate_transformer_3d.md
@@ -18,7 +18,7 @@ The model can be loaded with the following code snippet.
 ```python
 from diffusers import WanAnimateTransformer3DModel

-transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-720P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
 ```

 ## WanAnimateTransformer3DModel
diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md
index 3993e2efd0c8..6aab6c5b33b9 100644
--- a/docs/source/en/api/pipelines/wan.md
+++ b/docs/source/en/api/pipelines/wan.md
@@ -281,7 +281,7 @@ For replacement mode, you additionally need:
 - **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black)

 > [!NOTE]
-> The preprocessing tools are available in the original Wan-Animate repository. Integration of these preprocessing steps into Diffusers is planned for a future release.
+> Raw videos should not be used for inputs such as `pose_video`, which the pipeline expects to be preprocessed to extract the proper information. Preprocessing scripts to prepare these inputs are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.

 The example below demonstrates how to use the Wan-Animate pipeline:

@@ -293,13 +293,10 @@ import numpy as np
 import torch
 from diffusers import AutoencoderKLWan, WanAnimatePipeline
 from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel

 model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-    model_id, vae=vae, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")

 # Load character image and preprocessed videos
@@ -330,11 +327,11 @@ output = pipe(
     negative_prompt=negative_prompt,
     height=height,
     width=width,
-    num_frames=81,
-    guidance_scale=5.0,
-    mode="animation",  # Animation mode (default)
+    segment_frame_length=77,
+    guidance_scale=1.0,
+    mode="animate",  # Animation mode (default)
 ).frames[0]

-export_to_video(output, "animated_character.mp4", fps=16)
+export_to_video(output, "animated_character.mp4", fps=30)
 ```

@@ -345,14 +342,10 @@ import numpy as np
 import torch
 from diffusers import AutoencoderKLWan, WanAnimatePipeline
 from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel

 model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float16)
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")

 # Load all required inputs for replacement mode
@@ -387,11 +380,11 @@ output = pipe(
     negative_prompt=negative_prompt,
     height=height,
     width=width,
-    num_frames=81,
-    guidance_scale=5.0,
-    mode="replacement",  # Replacement mode
+    segment_frame_length=77,
+    guidance_scale=1.0,
+    mode="replace",  # Replacement mode
 ).frames[0]

-export_to_video(output, "character_replaced.mp4", fps=16)
+export_to_video(output, "character_replaced.mp4", fps=30)
 ```

@@ -402,14 +395,10 @@ import numpy as np
 import torch
 from diffusers import AutoencoderKLWan, WanAnimatePipeline
 from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel

 model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float16)
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")

 image = load_image("path/to/character.jpg")
@@ -443,14 +432,14 @@ output = pipe(
     negative_prompt=negative_prompt,
     height=height,
     width=width,
-    num_frames=81,
+    segment_frame_length=77,
     num_inference_steps=50,
     guidance_scale=5.0,
-    num_frames_for_temporal_guidance=5,  # Use 5 frames for temporal guidance (1 or 5 recommended)
+    prev_segment_conditioning_frames=5,  # Use 5 frames for temporal guidance (1 or 5 recommended)
     callback_on_step_end=callback_fn,
     callback_on_step_end_tensor_inputs=["latents"],
 ).frames[0]

-export_to_video(output, "animated_advanced.mp4", fps=16)
+export_to_video(output, "animated_advanced.mp4", fps=30)
 ```

@@ -458,10 +447,9 @@ export_to_video(output, "animated_advanced.mp4", fps=16)

 #### Key Parameters

-- **mode**: Choose between `"animation"` (default) or `"replacement"`
-- **num_frames_for_temporal_guidance**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
-- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt
-- **num_frames**: Total number of frames to generate. Should be divisible by `vae_scale_factor_temporal` (default: 4)
+- **mode**: Choose between `"animate"` (default) or `"replace"`
+- **prev_segment_conditioning_frames**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
+- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions. (Note that CFG will only target the text prompt and face conditioning.)

 ## Notes
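
As a quick illustration of how the key parameters documented above fit together, here is a minimal sketch of running animate mode with CFG enabled (`guidance_scale > 1.0`) so the negative prompt takes effect. It is based on the examples in this diff, not taken verbatim from it: the input arguments (`image`, `pose_video`, `face_video`), the resolution, the prompt text, and the file paths are assumptions for illustration only.

```python
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Character image plus preprocessed pose/face videos (argument names assumed from the
# animation-mode example; preprocess raw videos with the original Wan-Animate scripts first).
image = load_image("path/to/character.jpg")
pose_video = load_video("path/to/pose_video.mp4")
face_video = load_video("path/to/face_video.mp4")

output = pipe(
    image=image,
    pose_video=pose_video,
    face_video=face_video,
    prompt="A person dancing energetically in a bright studio",
    negative_prompt="blurry, distorted, low quality",  # only has an effect when CFG is enabled
    height=720,
    width=1280,
    segment_frame_length=77,
    prev_segment_conditioning_frames=5,  # 5 conditioning frames for better temporal consistency
    guidance_scale=3.5,  # > 1.0 enables CFG over the text prompt and face conditioning
    mode="animate",
).frames[0]

export_to_video(output, "animated_character_cfg.mp4", fps=30)
```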