- 
                Notifications
    You must be signed in to change notification settings 
- Fork 6.4k
Open
Labels
bugSomething isn't workingSomething isn't working
Description
Describe the bug
Enabling CPU offload before enabling parallelism raises a shape error after the first pipe call. This appears to be a bug in diffusers: CPU offload is not fully compatible with context parallelism, and vice versa.
- cpu offload before context parallelism (does not work)
pipe.enable_model_cpu_offload(device=device)
# pipe.transformer.set_attention_backend("flash")
pipe.transformer.set_attention_backend("_native_cudnn")
pipe.transformer.enable_parallelism(
    config=ContextParallelConfig(ulysses_degree=dist.get_world_size())
)
- cpu offload after context parallelism (works)
# pipe.transformer.set_attention_backend("flash")
pipe.transformer.set_attention_backend("_native_cudnn")
pipe.transformer.enable_parallelism(
    config=ContextParallelConfig(ulysses_degree=dist.get_world_size())
)
pipe.enable_model_cpu_offload(device=device)
Reproduction
import os
import time
import torch
import torch.distributed as dist
from diffusers import (
    QwenImagePipeline,
    QwenImageTransformer2DModel,
    ContextParallelConfig,
)
def maybe_init_distributed():
    """Initialize the NCCL process group if needed and select a CUDA device.

    Returns:
        A ``(rank, device)`` tuple: the rank reported by
        ``dist.get_rank()`` and the ``torch.device`` that was made current
        for this process.
    """
    already_initialized = dist.is_initialized()
    if not already_initialized:
        dist.init_process_group("nccl")

    global_rank = dist.get_rank()
    # Map the rank onto the visible GPUs (wraps around via modulo).
    device_index = global_rank % torch.cuda.device_count()
    cuda_device = torch.device("cuda", device_index)
    torch.cuda.set_device(cuda_device)
    return global_rank, cuda_device
def maybe_destroy_distributed():
    """Destroy the process group, doing nothing if none is initialized."""
    if not dist.is_initialized():
        return
    dist.destroy_process_group()
# Set up the process group and obtain this rank's CUDA device.
rank, device = maybe_init_distributed()
# Load the pipeline in bfloat16. The QWEN_IMAGE_DIR environment variable
# overrides the default "Qwen/Qwen-Image" location.
pipe = QwenImagePipeline.from_pretrained(
    os.environ.get(
        "QWEN_IMAGE_DIR",
        "Qwen/Qwen-Image",
    ),
    torch_dtype=torch.bfloat16,
)
# NOTE: CPU offload is deliberately enabled BEFORE context parallelism here
# in order to reproduce the bug: with this ordering the first pipe call
# succeeds but the second one raises a shape error. This appears to be a
# diffusers bug: CPU offload is not fully compatible with context
# parallelism, and vice versa.
pipe.enable_model_cpu_offload(device=device)
assert isinstance(pipe.transformer, QwenImageTransformer2DModel)
# Alternative attention backend (disabled):
# pipe.transformer.set_attention_backend("flash")
pipe.transformer.set_attention_backend("_native_cudnn")
# Context parallelism with the Ulysses degree set to the world size.
pipe.transformer.enable_parallelism(
    config=ContextParallelConfig(ulysses_degree=dist.get_world_size())
)
# NOTE: Workaround -- enable CPU offload AFTER enabling parallelism instead
# (remove the enable_model_cpu_offload call above and use this one):
# pipe.enable_model_cpu_offload(device=device)
# Optional VAE tiling (disabled):
# assert isinstance(pipe.vae, AutoencoderKLQwenImage)
# pipe.vae.enable_tiling()
# Suffixes appended to the prompt, keyed by prompt language.
positive_magic = {
    "en": ", Ultra HD, 4K, cinematic composition.",  # for English prompts
    "zh": ", 超清,4K,电影级构图.",  # for Chinese prompts
}
# Prompt used for image generation.
prompt = """A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition"""
# Use a blank (single-space) string when there is no specific concept to remove.
negative_prompt = " "
# Show the progress bar on rank 0 only.
pipe.set_progress_bar_config(disable=rank != 0)
def run_pipe():
    """Run one generation with the module-level ``pipe`` and return the image.

    Uses the module-level ``prompt`` (with the English ``positive_magic``
    suffix) and ``negative_prompt``, a 1024x1024 output size, 50 inference
    steps, and a CPU generator seeded with 42.

    Returns:
        The first element of the pipeline output's ``images``.
    """
    # do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
    seeded_generator = torch.Generator(device="cpu").manual_seed(42)
    full_prompt = prompt + positive_magic["en"]
    output = pipe(
        prompt=full_prompt,
        negative_prompt=negative_prompt,
        width=1024,
        height=1024,
        num_inference_steps=50,
        true_cfg_scale=4.0,
        generator=seeded_generator,
    )
    return output.images[0]
# Warmup run; its result is discarded.
_ = run_pipe() # always works
# Time the second run.
start = time.time()
image = run_pipe() # raises the error here if cpu offload is enabled before parallelism
end = time.time()
# Only rank 0 reports the timing and saves the image.
if rank == 0:
    time_cost = end - start
    save_path = f"qwen-image.cp{dist.get_world_size()}.png"
    print(f"Time cost: {time_cost:.2f}s")
    print(f"Saving image to {save_path}")
    image.save(save_path)
maybe_destroy_distributed()
Logs
Error:
[rank0]: Traceback (most recent call last):
[rank0]:   File "/workspace/dev/vipshop/cache-dit/examples/parallelism/run_qwen_image_cp_naive.py", line 71, in <module>
[rank0]:     start = time.time()
[rank0]:             ^^^^^^^^^^
[rank0]:   File "/workspace/dev/vipshop/cache-dit/examples/parallelism/run_qwen_image_cp_naive.py", line 54, in run_pipe
[rank0]:     # do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
[rank0]:             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
[rank0]:     return func(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/workspace/dev/vipshop/diffusers/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py", line 691, in __call__
[rank0]:     noise_pred = self.transformer(
[rank0]:                  ^^^^^^^^^^^^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/accelerate/hooks.py", line 175, in new_forward
[rank0]:     output = module._old_forward(*args, **kwargs)
[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/workspace/dev/vipshop/diffusers/src/diffusers/models/transformers/transformer_qwenimage.py", line 647, in forward
[rank0]:     encoder_hidden_states, hidden_states = block(
[rank0]:                                            ^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/workspace/dev/vipshop/diffusers/src/diffusers/models/transformers/transformer_qwenimage.py", line 443, in forward
[rank0]:     attn_output = self.attn(
[rank0]:                   ^^^^^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/workspace/dev/vipshop/diffusers/src/diffusers/models/attention_processor.py", line 605, in forward
[rank0]:     return self.processor(
[rank0]:            ^^^^^^^^^^^^^^^
[rank0]:   File "/workspace/dev/vipshop/diffusers/src/diffusers/models/transformers/transformer_qwenimage.py", line 322, in __call__
[rank0]:     img_query = apply_rotary_emb_qwen(img_query, img_freqs, use_real=False)
[rank0]:                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/workspace/dev/vipshop/diffusers/src/diffusers/models/transformers/transformer_qwenimage.py", line 139, in apply_rotary_emb_qwen
[rank0]:     x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
[rank0]:                                ~~~~~~~~~~^~~~~~~~~~~
[rank0]: RuntimeError: The size of tensor a (4096) must match the size of tensor b (2048) at non-singleton dimension 1
System Info
diffusers 0.36.dev0 (latest main branch), pytorch 2.9.0
Who can help?
Metadata
Metadata
Assignees
Labels
bugSomething isn't workingSomething isn't working