huggingface · akshan-main · Apr 16, 2026
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -449,6 +449,8 @@
             "Flux2KleinModularPipeline",
             "Flux2ModularPipeline",
             "FluxAutoBlocks",
+            "ErnieImageAutoBlocks",
+            "ErnieImageModularPipeline",
             "FluxKontextAutoBlocks",
             "FluxKontextModularPipeline",
             "FluxModularPipeline",
@@ -1234,6 +1236,8 @@
             Flux2KleinBaseModularPipeline,
             Flux2KleinModularPipeline,
             Flux2ModularPipeline,
+            ErnieImageAutoBlocks,
+            ErnieImageModularPipeline,
             FluxAutoBlocks,
             FluxKontextAutoBlocks,
             FluxKontextModularPipeline,

diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py
@@ -88,6 +88,10 @@
         "QwenImageLayeredModularPipeline",
         "QwenImageLayeredAutoBlocks",
     ]
+    _import_structure["ernie_image"] = [
+        "ErnieImageAutoBlocks",
+        "ErnieImageModularPipeline",
+    ]
     _import_structure["ltx"] = [
         "LTXAutoBlocks",
         "LTXModularPipeline",
@@ -106,6 +110,7 @@
         from ..utils.dummy_pt_objects import *  # noqa F403
     else:
         from .components_manager import ComponentsManager
+        from .ernie_image import ErnieImageAutoBlocks, ErnieImageModularPipeline
         from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
         from .flux2 import (
             Flux2AutoBlocks,

diff --git a/src/diffusers/modular_pipelines/ernie_image/__init__.py b/src/diffusers/modular_pipelines/ernie_image/__init__.py
@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+# Placeholder objects to attach to the lazy module when torch/transformers are unavailable.
+_dummy_objects = {}
+# Maps submodule name -> list of exported names; handed to `_LazyModule` at the bottom of this file.
+_import_structure = {}
+
+# Register the real ErnieImage exports only when both torch and transformers are available;
+# otherwise collect the objects of the shared dummy module instead.
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["modular_blocks_ernie_image"] = ["ErnieImageAutoBlocks"]
+    _import_structure["modular_pipeline"] = ["ErnieImageModularPipeline"]
+
+# Under type checking or when DIFFUSERS_SLOW_IMPORT is set, import the public names eagerly
+# (or the dummy replacements when the optional dependencies are missing).
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .modular_blocks_ernie_image import ErnieImageAutoBlocks
+        from .modular_pipeline import ErnieImageModularPipeline
+else:
+    import sys
+
+    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`
+    # (presumably so submodules are only imported on first attribute access -- see `_LazyModule`).
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    # Expose the dummy placeholders collected above as attributes of the replacement module.
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/modular_pipelines/ernie_image/before_denoise.py b/src/diffusers/modular_pipelines/ernie_image/before_denoise.py
@@ -0,0 +1,269 @@
+# Copyright 2025 Baidu ERNIE-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...models import ErnieImageTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging
+from ...utils.torch_utils import randn_tensor
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import ErnieImageModularPipeline
+
+
+# Module-level logger obtained from the library's logging helper, keyed by this module's name.
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def _pad_text(
+    text_hiddens: list[torch.Tensor], device: torch.device, dtype: torch.dtype, text_in_dim: int
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Pad a list of variable-length text hidden states to a common length.
+
+    Args:
+        text_hiddens: One tensor per prompt. 2D tensors are used as-is with dim 0 taken as the sequence
+            length; 3D tensors have dim 1 squeezed first (presumably they arrive as (T, 1, H) -- confirm
+            against the text encoder step).
+        device: Device on which the padded batch and the lengths are allocated.
+        dtype: Floating dtype of the padded batch.
+        text_in_dim: Size of the last (hidden) dimension of the padded batch.
+
+    Returns:
+        A tuple `(padded, lengths)`: `padded` has shape `(len(text_hiddens), T_max, text_in_dim)` and is
+        zero-filled past each prompt's length; `lengths` is a `torch.long` tensor of the per-prompt lengths.
+        An empty input yields tensors of shape `(0, 0, text_in_dim)` and `(0,)`.
+    """
+    batch_size = len(text_hiddens)
+    if batch_size == 0:
+        return (
+            torch.zeros((0, 0, text_in_dim), device=device, dtype=dtype),
+            torch.zeros((0,), device=device, dtype=torch.long),
+        )
+    # A single `.to` call moves and casts in one step instead of two chained conversions.
+    normalized = [(t.squeeze(1) if t.dim() == 3 else t).to(device=device, dtype=dtype) for t in text_hiddens]
+    # The lengths are plain Python ints, so take the maximum on the host; reading it back from the
+    # device tensor via `.item()` would force a host/device synchronization.
+    seq_lengths = [t.shape[0] for t in normalized]
+    lengths = torch.tensor(seq_lengths, device=device, dtype=torch.long)
+    padded = torch.zeros((batch_size, max(seq_lengths), text_in_dim), device=device, dtype=dtype)
+    for i, t in enumerate(normalized):
+        padded[i, : t.shape[0], :] = t
+    return padded, lengths
+
+
+class ErnieImageTextInputStep(ModularPipelineBlocks):
+    """Modular block that batches per-prompt text embeddings for the ErnieImage denoiser.
+
+    Reads the `prompt_embeds` (and optional `negative_prompt_embeds`) lists from the pipeline state, pads
+    each list with `_pad_text`, and writes the padded tensors, their lengths and the resolved `batch_size`
+    back to the state.
+    """
+
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        """Short human-readable summary of what this block does."""
+        summary = (
+            "Input processing step that pads the variable-length text hidden states to a common length and "
+            "produces `text_bth` / `text_lens` tensors consumed by the denoiser."
+        )
+        return summary
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        """Components this block reads from the pipeline (only the transformer, for its dtype)."""
+        transformer_spec = ComponentSpec("transformer", ErnieImageTransformer2DModel)
+        return [transformer_spec]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        """Declared inputs of this block."""
+        positive_embeds = InputParam(
+            "prompt_embeds",
+            required=True,
+            type_hint=list,
+            description="List of per-prompt text embeddings from the text encoder step.",
+        )
+        negative_embeds = InputParam(
+            "negative_prompt_embeds",
+            type_hint=list,
+            description="List of per-prompt negative text embeddings from the text encoder step.",
+        )
+        # NOTE(review): declared here but not read anywhere in `__call__` below.
+        images_per_prompt = InputParam(
+            "num_images_per_prompt",
+            type_hint=int,
+            default=1,
+            description="Number of images to generate per prompt.",
+        )
+        prompt_batch_size = InputParam(
+            "batch_size",
+            type_hint=int,
+            default=None,
+            description="Prompt batch size. Resolved from `prompt_embeds` when not provided.",
+        )
+        return [positive_embeds, negative_embeds, images_per_prompt, prompt_batch_size]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        """Values this block writes to the pipeline state."""
+        # All tensor outputs share the same type hint and kwargs group; only name and description differ.
+        denoiser_fields = (
+            ("text_bth", "Padded text hidden states of shape (B, T_max, H) fed into the transformer."),
+            ("text_lens", "Actual per-prompt text lengths used to build the transformer attention mask."),
+            ("negative_text_bth", "Padded negative text hidden states, when classifier-free guidance is enabled."),
+            (
+                "negative_text_lens",
+                "Actual per-prompt negative text lengths, when classifier-free guidance is enabled.",
+            ),
+        )
+        outputs = [OutputParam("batch_size", type_hint=int, description="The number of prompts in the batch.")]
+        outputs.extend(
+            OutputParam(
+                field_name,
+                type_hint=torch.Tensor,
+                kwargs_type="denoiser_input_fields",
+                description=field_description,
+            )
+            for field_name, field_description in denoiser_fields
+        )
+        return outputs
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Pad the text embeddings and store them on the pipeline state.
+
+        Returns the `(components, state)` pair after updating `state`.
+        """
+        block_state = self.get_block_state(state)
+        target_device = components._execution_device
+        target_dtype = components.transformer.dtype
+        hidden_size = components.text_in_dim
+
+        positives = block_state.prompt_embeds
+        # An explicit, truthy `batch_size` wins; otherwise use the number of prompt embeddings.
+        requested_batch_size = getattr(block_state, "batch_size", None)
+        block_state.batch_size = requested_batch_size if requested_batch_size else len(positives)
+
+        block_state.text_bth, block_state.text_lens = _pad_text(positives, target_device, target_dtype, hidden_size)
+
+        negatives = getattr(block_state, "negative_prompt_embeds", None)
+        if negatives is None:
+            # No negative prompt supplied: publish explicit `None` placeholders.
+            block_state.negative_text_bth = None
+            block_state.negative_text_lens = None
+        else:
+            padded_negatives, negative_lengths = _pad_text(negatives, target_device, target_dtype, hidden_size)
+            block_state.negative_text_bth = padded_negatives
+            block_state.negative_text_lens = negative_lengths
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ErnieImageSetTimestepsStep(ModularPipelineBlocks):
+    """Modular block that configures the scheduler's timesteps for an ErnieImage run.
+
+    Builds a linearly spaced sigma schedule starting at 1.0 (the final 0.0 endpoint is dropped), passes it
+    to the scheduler's `set_timesteps`, and publishes the resulting timesteps on the pipeline state.
+    """
+
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        """Short human-readable summary of what this block does."""
+        return "Step that sets the scheduler's timesteps for inference using a linear sigma schedule."
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        """Components this block uses (the flow-match Euler scheduler)."""
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        """Declared inputs of this block."""
+        return [
+            InputParam(
+                "num_inference_steps",
+                type_hint=int,
+                default=50,
+                description="Number of denoising steps.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        """Values this block writes to the pipeline state."""
+        return [
+            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for inference."),
+            OutputParam("num_inference_steps", type_hint=int, description="The number of denoising steps."),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Set the scheduler timesteps and store them on the pipeline state.
+
+        Returns the `(components, state)` pair after updating `state`.
+
+        Raises:
+            ValueError: If `num_inference_steps` is `None` or smaller than 1.
+        """
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        num_inference_steps = block_state.num_inference_steps
+
+        # Fail early with a clear message: zero steps would produce an empty sigma schedule, and `None`
+        # would otherwise surface as an opaque TypeError in the arithmetic below.
+        if num_inference_steps is None or num_inference_steps < 1:
+            raise ValueError(f"`num_inference_steps` must be a positive integer, got {num_inference_steps}.")
+
+        # `num_inference_steps + 1` points from 1.0 down to 0.0, with the trailing 0.0 removed so that
+        # exactly `num_inference_steps` sigmas are handed to the scheduler.
+        sigmas = torch.linspace(1.0, 0.0, num_inference_steps + 1)[:-1]
+        components.scheduler.set_timesteps(sigmas=sigmas, device=device)
+
+        block_state.timesteps = components.scheduler.timesteps
+        block_state.num_inference_steps = num_inference_steps
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ErnieImagePrepareLatentsStep(ModularPipelineBlocks):
+    """Modular block that resolves the output resolution and provides the starting latents.
+
+    Falsy `height` / `width` fall back to the pipeline defaults. Caller-supplied `latents` are only moved
+    to the execution device and transformer dtype; otherwise random noise is drawn via `randn_tensor`.
+    """
+
+    model_name = "ernie-image"
+
+    @property
+    def description(self) -> str:
+        """Short human-readable summary of what this block does."""
+        summary = "Prepare random noise latents for the ErnieImage text-to-image denoising process."
+        return summary
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        """Components this block reads from the pipeline (only the transformer, for its dtype)."""
+        transformer_spec = ComponentSpec("transformer", ErnieImageTransformer2DModel)
+        return [transformer_spec]
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        """Declared inputs of this block."""
+        height_param = InputParam("height", type_hint=int, description="The height in pixels of the generated image.")
+        width_param = InputParam("width", type_hint=int, description="The width in pixels of the generated image.")
+        latents_param = InputParam(
+            "latents",
+            type_hint=torch.Tensor,
+            description="Pre-generated noisy latents. If provided, skips noise sampling.",
+        )
+        images_per_prompt_param = InputParam(
+            "num_images_per_prompt",
+            type_hint=int,
+            default=1,
+            description="Number of images to generate per prompt.",
+        )
+        generator_param = InputParam("generator", description="Torch generator for deterministic noise sampling.")
+        batch_size_param = InputParam(
+            "batch_size",
+            required=True,
+            type_hint=int,
+            description="Prompt batch size resolved by the text input step.",
+        )
+        return [
+            height_param,
+            width_param,
+            latents_param,
+            images_per_prompt_param,
+            generator_param,
+            batch_size_param,
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        """Values this block writes to the pipeline state."""
+        latents_out = OutputParam("latents", type_hint=torch.Tensor, description="The initial noise latents to denoise.")
+        height_out = OutputParam("height", type_hint=int, description="The resolved image height in pixels.")
+        width_out = OutputParam("width", type_hint=int, description="The resolved image width in pixels.")
+        return [latents_out, height_out, width_out]
+
+    @staticmethod
+    def _check_inputs(components: ErnieImageModularPipeline, height: int, width: int) -> None:
+        """Raise `ValueError` unless both `height` and `width` are multiples of the VAE scale factor."""
+        vae_scale_factor = components.vae_scale_factor
+        if any(size % vae_scale_factor != 0 for size in (height, width)):
+            raise ValueError(
+                f"`height` and `width` must be divisible by {vae_scale_factor}, got {height} and {width}."
+            )
+
+    @torch.no_grad()
+    def __call__(self, components: ErnieImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Resolve `height` / `width`, create or convert the latents, and store them on the state.
+
+        Returns the `(components, state)` pair after updating `state`.
+        """
+        block_state = self.get_block_state(state)
+        target_device = components._execution_device
+        target_dtype = components.transformer.dtype
+
+        # Falsy values (e.g. `None`) fall back to the pipeline-level defaults.
+        resolved_height = block_state.height or components.default_height
+        resolved_width = block_state.width or components.default_width
+        self._check_inputs(components, resolved_height, resolved_width)
+
+        num_samples = block_state.batch_size * block_state.num_images_per_prompt
+        latent_rows = resolved_height // components.vae_scale_factor
+        latent_cols = resolved_width // components.vae_scale_factor
+        latent_shape = (num_samples, components.num_channels_latents, latent_rows, latent_cols)
+
+        if block_state.latents is not None:
+            # Caller-supplied latents are converted as-is; their shape is not compared to `latent_shape`.
+            block_state.latents = block_state.latents.to(device=target_device, dtype=target_dtype)
+        else:
+            block_state.latents = randn_tensor(
+                latent_shape,
+                generator=block_state.generator,
+                device=target_device,
+                dtype=target_dtype,
+            )
+
+        block_state.height = resolved_height
+        block_state.width = resolved_width
+
+        self.set_block_state(state, block_state)
+        return components, state