From 8a78e9613d741225fb30acda65b1eb0d8cd995e5 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 9 Dec 2025 10:05:16 +0530
Subject: [PATCH 1/2] improve distributed inference cp docs.

---
 .../en/training/distributed_inference.md     | 88 ++++++++++++++-----
 1 file changed, 64 insertions(+), 24 deletions(-)

diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md
index f9756e1a67aa..7b328d890e37 100644
--- a/docs/source/en/training/distributed_inference.md
+++ b/docs/source/en/training/distributed_inference.md
@@ -237,6 +237,9 @@ By selectively loading and unloading the models you need at a given stage and sh
 
 Use [`~ModelMixin.set_attention_backend`] to switch to a more optimized attention backend. Refer to this [table](../optimization/attention_backends#available-backends) for a complete list of available backends.
 
+> [!NOTE]
+> Most attention backends are compatible with context parallelism. If one is not compatibel with context parallelism, please [file a feature request](https://github.com/huggingface/diffusers/issues/new).
+
 ### Ring Attention
 
 Key (K) and value (V) representations communicate between devices using [Ring Attention](https://huggingface.co/papers/2310.01889). This ensures each split sees every other token's K/V. Each GPU computes attention for its local K/V and passes it to the next GPU in the ring. No single GPU holds the full sequence, which reduces communication latency.
 
@@ -245,40 +248,56 @@ Pass a [`ContextParallelConfig`] to the `parallel_config` argument of the transf
 
 ```py
 import torch
-from diffusers import AutoModel, QwenImagePipeline, ContextParallelConfig
-
-try:
-    torch.distributed.init_process_group("nccl")
-    rank = torch.distributed.get_rank()
-    device = torch.device("cuda", rank % torch.cuda.device_count())
+from torch import distributed as dist
+from diffusers import DiffusionPipeline, ContextParallelConfig
+
+def setup_distributed():
+    if not dist.is_initialized():
+        dist.init_process_group(backend="nccl")
+    rank = dist.get_rank()
+    device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-
-    transformer = AutoModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, parallel_config=ContextParallelConfig(ring_degree=2))
-    pipeline = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", transformer=transformer, torch_dtype=torch.bfloat16, device_map="cuda")
-    pipeline.transformer.set_attention_backend("flash")
+    return device
+
+def main():
+    device = setup_distributed()
+    world_size = dist.get_world_size()
+
+    pipeline = DiffusionPipeline.from_pretrained(
+        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, device_map=device
+    )
+    pipeline.transformer.set_attention_backend("_native_cudnn")
+
+    cp_config = ContextParallelConfig(ring_degree=world_size)
+    pipeline.transformer.enable_parallelism(config=cp_config)
 
     prompt = """
     cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
     highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
     """
-
+    # Must specify a generator so all ranks start with the same latents (or pass your own)
     generator = torch.Generator().manual_seed(42)
-    image = pipeline(prompt, num_inference_steps=50, generator=generator).images[0]
-
-    if rank == 0:
-        image.save("output.png")
-
-except Exception as e:
-    print(f"An error occurred: {e}")
-    torch.distributed.breakpoint()
-    raise
-
-finally:
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
+    image = pipeline(
+        prompt,
+        guidance_scale=3.5,
+        num_inference_steps=50,
+        generator=generator,
+    ).images[0]
+
+    if dist.get_rank() == 0:
+        image.save("output.png")
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
 ```
 
+The script above needs to be run with a distributed launcher that is compatible with PyTorch. You can use `torchrun` for this: `torchrun --nproc-per-node 2 above_script.py`. `--nproc-per-node` depends on the number of GPUs available.
+
 ### Ulysses Attention
 
 [Ulysses Attention](https://huggingface.co/papers/2309.14509) splits a sequence across GPUs and performs an *all-to-all* communication (every device sends/receives data to every other device). Each GPU ends up with all tokens for only a subset of attention heads. Each GPU computes attention locally on all tokens for its head, then performs another all-to-all to regroup results by tokens for the next layer.
 
@@ -288,5 +307,26 @@
 Pass the [`ContextParallelConfig`] to [`~ModelMixin.enable_parallelism`].
 
 ```py
+# Depending on the number of GPUs available.
 pipeline.transformer.enable_parallelism(config=ContextParallelConfig(ulysses_degree=2))
+```
+
+### parallel_config
+
+It's possible to pass a `ContextParallelConfig` to `parallel_config` during initializing a model and a pipeline:
+
+```py
+CKPT_ID = "black-forest-labs/FLUX.1-dev"
+
+cp_config = ContextParallelConfig(ring_degree=2)
+transformer = AutoModel.from_pretrained(
+    CKPT_ID,
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16,
+    parallel_config=cp_config
+)
+
+pipeline = DiffusionPipeline.from_pretrained(
+    CKPT_ID, transformer=transformer, torch_dtype=torch.bfloat16,
+).to(device)
 ```
\ No newline at end of file

From b0813ccdd6d0bf24698c3851fd36dd7d3a8cb5db Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Wed, 10 Dec 2025 08:19:59 +0530
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/training/distributed_inference.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md
index 7b328d890e37..534124cb93ec 100644
--- a/docs/source/en/training/distributed_inference.md
+++ b/docs/source/en/training/distributed_inference.md
@@ -237,8 +237,7 @@ By selectively loading and unloading the models you need at a given stage and sh
 
 Use [`~ModelMixin.set_attention_backend`] to switch to a more optimized attention backend. Refer to this [table](../optimization/attention_backends#available-backends) for a complete list of available backends.
 
-> [!NOTE]
-> Most attention backends are compatible with context parallelism. If one is not compatibel with context parallelism, please [file a feature request](https://github.com/huggingface/diffusers/issues/new).
+Most attention backends are compatible with context parallelism. Open an [issue](https://github.com/huggingface/diffusers/issues/new) if a backend is not compatible.
 
 ### Ring Attention
 
@@ -296,7 +295,11 @@ if __name__ == "__main__":
     main()
 ```
 
-The script above needs to be run with a distributed launcher that is compatible with PyTorch. You can use `torchrun` for this: `torchrun --nproc-per-node 2 above_script.py`. `--nproc-per-node` depends on the number of GPUs available.
+The script above needs to be run with a PyTorch-compatible distributed launcher, such as [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html). Set `--nproc-per-node` to the number of GPUs available.
+
+```shell
+torchrun --nproc-per-node 2 above_script.py
+```
 
 ### Ulysses Attention
 
@@ -313,7 +316,7 @@ pipeline.transformer.enable_parallelism(config=ContextParallelConfig(ulysses_deg
 
 ### parallel_config
 
-It's possible to pass a `ContextParallelConfig` to `parallel_config` during initializing a model and a pipeline:
+Pass `parallel_config` during model initialization to enable context parallelism.
 
 ```py
 CKPT_ID = "black-forest-labs/FLUX.1-dev"