Describe the bug
When passing a transformer loaded with from_single_file into WanImageToVideoPipeline, the checkpoint is detected as Wan 2.1 instead of Wan 2.2, which breaks the image-encoding step of the pipeline. Setting pipe.transformer.config.image_dim = None after loading works around the issue.
More details: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers/discussions/5
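For anyone hitting the same error, this is the workaround applied right after building the pipeline (a minimal sketch; whether transformer_2 needs the same change is an assumption I haven't verified):

# Workaround: clear the image_dim that the Wan 2.1 misdetection filled in,
# so the pipeline skips CLIP image encoding as expected for Wan 2.2 I2V.
pipe.transformer.config.image_dim = None
# Assumption: the second expert is misdetected the same way; untested.
pipe.transformer_2.config.image_dim = None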
Reproduction
import torch
import numpy as np
from diffusers import WanTransformer3DModel, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from PIL import Image

transformer_high_noise = WanTransformer3DModel.from_single_file(
    "./high_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False,
)
transformer_low_noise = WanTransformer3DModel.from_single_file(
    "./low_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False,
)

pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    transformer=transformer_high_noise,  # high-noise expert goes to the main transformer
    transformer_2=transformer_low_noise,  # low-noise expert goes to transformer_2
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()

prompt = "A young girl with blue hair is walking in a cherry blossom park, petals gently falling around her. The girl wears a school uniform. The girl is smiling."
negative_prompt = "3d, cg, photo, stop, wait"

# Snap the resolution to multiples required by the VAE and the transformer patch size
image = Image.open("./image.png")
max_area = 720 * 1280
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))

output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=16 * 5 + 1,  # 5 seconds at 16 fps
    guidance_scale=1.0,
    guidance_scale_2=1.0,
    num_inference_steps=8,
).frames[0]
export_to_video(output, "output.mp4", fps=16)
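To confirm the misdetection before running the pipeline, printing the relevant config field is enough (a hypothetical check, not part of the original script):

# For a Wan 2.2 I2V checkpoint this should be None; a non-None value means
# from_single_file detected the file as a Wan 2.1 I2V model.
print(transformer_high_noise.config.image_dim)
print(transformer_low_noise.config.image_dim)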
Logs
Loading pipeline components...: 50%|███████████████▌ | 3/6 [00:00<00:00, 6.37it/s]
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|███████████████████████████████████| 3/3 [00:00<00:00, 139.04it/s]
Loading pipeline components...: 100%|███████████████████████████████| 6/6 [00:00<00:00, 10.10it/s]
Traceback (most recent call last):
  File "/home/user/path/mini_i2v.py", line 44, in <module>
    output = pipe(
             ^^^^^
  File "/home/user/path/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/path/.venv/lib/python3.12/site-packages/diffusers/pipelines/wan/pipeline_wan_i2v.py", line 681, in __call__
    image_embeds = self.encode_image(image, device)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/path/.venv/lib/python3.12/site-packages/diffusers/pipelines/wan/pipeline_wan_i2v.py", line 246, in encode_image
    image = self.image_processor(images=image, return_tensors="pt").to(device)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: 'NoneType' object is not callable
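My reading of the failure, paraphrasing pipeline_wan_i2v.py around the frames in the traceback (so treat the exact condition as an assumption): the Wan2.2-I2V-A14B-Diffusers repo ships no image encoder or image processor, and the pipeline only attempts image encoding when the transformer config reports an image_dim, roughly:

# Paraphrase of the gating in WanImageToVideoPipeline.__call__ (assumption, not verbatim):
if self.transformer.config.image_dim is not None:  # True because of the Wan 2.1 misdetection
    image_embeds = self.encode_image(image, device)  # image_processor is None -> TypeError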
System Info
diffusers @ git+https://github.com/huggingface/diffusers@f5c113e4395bc373ab540fc5a1f7490b7120c40f
Python 3.12.3
Ubuntu