11 changes: 11 additions & 0 deletions docs/source/en/api/pipelines/flux2.md
@@ -32,6 +32,17 @@ Flux.2 can potentially generate better outputs with better prompts. We can upsample
an input prompt by setting the `caption_upsample_temperature` argument in the pipeline call.
The [official implementation](https://github.com/black-forest-labs/flux2/blob/5a5d316b1b42f6b59a8c9194b77c8256be848432/src/flux2/text_encoder.py#L140) recommends this value to be 0.15.
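Caption upsampling samples an expanded prompt from the text encoder, and the temperature controls how random that sampling is. A minimal sketch of what a sampling temperature means, in plain numpy rather than the diffusers implementation (the logits here are made up for illustration):

```python
import numpy as np

def softmax_with_temperature(logits, temperature):
    # Lower temperature sharpens the distribution toward the argmax token;
    # the recommended 0.15 is close to greedy decoding.
    scaled = logits / temperature
    e = np.exp(scaled - scaled.max())
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.5])  # hypothetical next-token logits
sharp = softmax_with_temperature(logits, 0.15)
flat = softmax_with_temperature(logits, 1.0)

assert sharp[0] > 0.99  # near-deterministic at T = 0.15
assert flat[0] < 0.7    # noticeably more random at T = 1.0
```

A low value like 0.15 therefore keeps the upsampled prompt close to the model's most likely expansion while allowing slight variation.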

## Reference conditioning vs. img2img

The `image` argument on [`Flux2Pipeline`] and [`Flux2KleinPipeline`] provides *reference conditioning*. Reference images are encoded as additional attention tokens that flow through the
transformer alongside the text prompt. Unlike [`FluxImg2ImgPipeline`], Flux.2 does not add noise to the input image. Passing `strength` to [`Flux2Pipeline`] raises:

```
TypeError: Flux2Pipeline.__call__() got an unexpected keyword argument 'strength'
```

Drop the `strength` argument and pass references with `image`. For inpainting, use [`Flux2KleinInpaintPipeline`] instead.
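The error above is ordinary Python behavior for an unexpected keyword argument, independent of diffusers itself. A minimal stand-in class (not the real pipeline) reproduces both the error and the intended calling pattern, where references go through `image`:

```python
class FakePipeline:
    """Illustrative stand-in for Flux2Pipeline; not the diffusers class."""

    def __call__(self, prompt=None, image=None):
        # References arrive via `image` (a single image or a list);
        # there is deliberately no `strength` parameter.
        refs = image if isinstance(image, list) else [image]
        return f"conditioned on {len(refs)} reference(s)"

pipe = FakePipeline()
result = pipe(prompt="a cat", image=["ref1.png", "ref2.png"])

try:
    pipe(prompt="a cat", image="ref1.png", strength=0.6)
    message = ""
except TypeError as err:
    message = str(err)

assert "strength" in message
assert result == "conditioned on 2 reference(s)"
```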

## Flux2Pipeline

[[autodoc]] Flux2Pipeline
Expand Down
14 changes: 9 additions & 5 deletions src/diffusers/pipelines/flux2/pipeline_flux2.py
@@ -769,11 +769,15 @@ def __call__(

Args:
image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`):
`Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
latents as `image`, but if passing latents directly it is not encoded again.
Reference image(s) used to condition generation. Flux.2 encodes them as additional attention tokens
that flow through the transformer alongside the text prompt. This is **reference conditioning**, not
SD/Flux.1-style img2img, so there is no companion `strength` argument. Pass a list to provide
multiple references.

For both numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor
or a list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy
array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. Can also accept
image latents directly, in which case they will not be re-encoded.
prompt (`str` or `list[str]`, *optional*):
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
instead.
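The shape conventions documented above can be sketched in plain numpy: arrays are channels-last, tensors are channels-first, and both use a `[0, 1]` value range. The transpose below mirrors the channels-last-to-channels-first conversion a preprocessor would perform; it is an illustration, not the diffusers image processor:

```python
import numpy as np

img = np.random.rand(64, 64, 3)            # (H, W, C) array, values in [0, 1]
batched = img[None]                        # (B, H, W, C): add a batch axis
chw = np.transpose(batched, (0, 3, 1, 2))  # (B, C, H, W): tensor-style layout

assert batched.shape == (1, 64, 64, 3)
assert chw.shape == (1, 3, 64, 64)
assert 0.0 <= img.min() and img.max() <= 1.0
```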
14 changes: 9 additions & 5 deletions src/diffusers/pipelines/flux2/pipeline_flux2_klein.py
@@ -635,11 +635,15 @@ def __call__(

Args:
image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
`Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
latents as `image`, but if passing latents directly it is not encoded again.
Reference image(s) used to condition generation. Flux.2 encodes them as additional attention tokens
that flow through the transformer alongside the text prompt. This is **reference conditioning**, not
SD/Flux.1-style img2img, so there is no companion `strength` argument. Pass a list to provide
multiple references.

For both numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor
or a list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy
array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. Can also accept
image latents directly, in which case they will not be re-encoded.
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
instead.
Expand Down