20 changes: 13 additions & 7 deletions tests/pipelines/qwenimage/test_qwenimage_edit.py
@@ -15,7 +15,6 @@
 import unittest
 
 import numpy as np
-import pytest
 import torch
 from PIL import Image
 from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
@@ -134,15 +133,17 @@ def get_dummy_inputs(self, device, seed=0):
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
+        # Even if we specify smaller dimensions for the images, it won't take effect
+        # because the internal implementation enforces a minimum resolution of 1024x1024.
         inputs = {
             "prompt": "dance monkey",
-            "image": Image.new("RGB", (32, 32)),
+            "image": Image.new("RGB", (1024, 1024)),
Collaborator:

As we are making the dummy image input much bigger, are the QwenImageEditPipelineFastTests too heavy for the CI? Running

pytest tests/pipelines/qwenimage/test_qwenimage_edit.py::QwenImageEditPipelineFastTests

takes ~4 minutes on an A100 on DGX.
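
If runtime becomes a blocker, stock pytest can report which individual tests dominate (nothing beyond the built-in durations flag is assumed here):

pytest tests/pipelines/qwenimage/test_qwenimage_edit.py::QwenImageEditPipelineFastTests --durations=10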

Member Author:

Observe how txt changes depending on whether `encode_prompt()` is called in isolation or through `pipe(...)`. First, `encode_prompt()` in isolation:

txt=["<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>dance monkey<|im_end|>\n<|im_start|>assistant\n"], image.size=(32, 32)

Second, through `pipe(...)`:

txt=["<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>dance monkey<|im_end|>\n<|im_start|>assistant\n"], image.size=(1024, 1024)

Because `image.size` differs between the two paths, the resulting `prompt_embeds` differ as well, and that causes the assertion to fail. Fixing the image size to (1024, 1024) keeps the prompt embeddings identical in both cases.

But I am also concerned about the runtime increase from raising the image resolution like this. Maybe we just keep the PR open for now unless the community requests it for this model?
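
A minimal sketch of the mismatch, assuming an already-constructed `pipe` (the `encode_prompt` kwargs mirror the test inputs above and are assumptions, not a documented API):

```python
from PIL import Image

image = Image.new("RGB", (32, 32))

# Called in isolation, encode_prompt sees the raw 32x32 image, so the VL
# processor builds the chat template against image.size == (32, 32).
isolated_embeds = pipe.encode_prompt(prompt="dance monkey", image=image)

# Called through pipe(...), the image is first resized up to the enforced
# minimum resolution, so encode_prompt effectively sees a 1024x1024 image
# and produces different prompt_embeds for the same text.
output = pipe(prompt="dance monkey", image=image, num_inference_steps=2)
```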

"negative_prompt": "bad quality",
"generator": generator,
"num_inference_steps": 2,
"true_cfg_scale": 1.0,
"height": 32,
"width": 32,
"height": 1024,
"width": 1024,
"max_sequence_length": 16,
"output_type": "pt",
}
@@ -238,6 +239,11 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True)
-    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
-        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol)
+    def test_encode_prompt_works_in_isolation(
+        self, extra_required_param_value_dict=None, keep_params=None, atol=1e-4, rtol=1e-4
+    ):
+        # We include `image` because it is needed both in `encode_prompt` and in some
+        # other subsequent calculations. We include `max_sequence_length` so its value
+        # stays the same across all invocations of `encode_prompt` in this test.
+        keep_params = ["image", "max_sequence_length"]
+        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, keep_params, atol, rtol)
19 changes: 13 additions & 6 deletions tests/pipelines/qwenimage/test_qwenimage_edit_plus.py
@@ -134,16 +134,18 @@ def get_dummy_inputs(self, device, seed=0):
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
-        image = Image.new("RGB", (32, 32))
+        # Even if we specify smaller dimensions for the images, it won't take effect
+        # because the internal implementation enforces a minimum resolution of 384x384.
+        image = Image.new("RGB", (384, 384))
Collaborator:

Similar comment to #12403 (comment), but may not be as big of an issue since the dummy image is smaller here.

         inputs = {
             "prompt": "dance monkey",
             "image": [image, image],
             "negative_prompt": "bad quality",
             "generator": generator,
             "num_inference_steps": 2,
             "true_cfg_scale": 1.0,
-            "height": 32,
-            "width": 32,
+            "height": 384,
+            "width": 384,
             "max_sequence_length": 16,
             "output_type": "pt",
         }
@@ -236,9 +238,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True)
-    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
-        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol)
+    def test_encode_prompt_works_in_isolation(
+        self, extra_required_param_value_dict=None, keep_params=None, atol=1e-4, rtol=1e-4
+    ):
+        # We include `image` because it is needed both in `encode_prompt` and in some
+        # other subsequent calculations. We include `max_sequence_length` so its value
+        # stays the same across all invocations of `encode_prompt` in this test.
+        keep_params = ["image", "max_sequence_length"]
+        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, keep_params, atol, rtol)

     @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
     def test_num_images_per_prompt():
53 changes: 28 additions & 25 deletions tests/pipelines/test_pipelines_common.py
@@ -5,7 +5,7 @@
 import tempfile
 import unittest
 import uuid
-from typing import Any, Callable, Dict, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import numpy as np
 import PIL.Image
@@ -2082,20 +2082,26 @@ def test_loading_with_incorrect_variants_raises_error(self):

assert f"You are trying to load the model files of the `variant={variant}`" in str(error.exception)

def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
def test_encode_prompt_works_in_isolation(
self,
extra_required_param_value_dict: Optional[Dict] = None,
keep_params: Optional[List] = None,
atol=1e-4,
rtol=1e-4,
):
if not hasattr(self.pipeline_class, "encode_prompt"):
return

components = self.get_dummy_components()

def _contains_text_key(name):
return any(token in name for token in ("text", "tokenizer", "processor"))

# We initialize the pipeline with only text encoders and tokenizers,
# mimicking a real-world scenario.
components_with_text_encoders = {}
for k in components:
if "text" in k or "tokenizer" in k:
components_with_text_encoders[k] = components[k]
else:
components_with_text_encoders[k] = None
components_with_text_encoders = {
name: component if _contains_text_key(name) else None for name, component in components.items()
}
pipe_with_just_text_encoder = self.pipeline_class(**components_with_text_encoders)
pipe_with_just_text_encoder = pipe_with_just_text_encoder.to(torch_device)

@@ -2105,17 +2111,18 @@ def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=
         encode_prompt_parameters = list(encode_prompt_signature.parameters.values())
 
         # Required args in encode_prompt with those with no default.
-        required_params = []
-        for param in encode_prompt_parameters:
-            if param.name == "self" or param.name == "kwargs":
-                continue
-            if param.default is inspect.Parameter.empty:
-                required_params.append(param.name)
+        required_params = [
+            param.name
+            for param in encode_prompt_parameters
+            if param.name not in {"self", "kwargs"} and param.default is inspect.Parameter.empty
+        ]
 
         # Craft inputs for the `encode_prompt()` method to run in isolation.
         encode_prompt_param_names = [p.name for p in encode_prompt_parameters if p.name != "self"]
-        input_keys = list(inputs.keys())
-        encode_prompt_inputs = {k: inputs.pop(k) for k in input_keys if k in encode_prompt_param_names}
+        encode_prompt_inputs = {name: inputs[name] for name in encode_prompt_param_names if name in inputs}
+        for name in encode_prompt_param_names:
+            if name in inputs and (not keep_params or name not in keep_params):
+                inputs.pop(name)
 
         pipe_call_signature = inspect.signature(pipe_with_just_text_encoder.__call__)
         pipe_call_parameters = pipe_call_signature.parameters
@@ -2150,18 +2157,15 @@ def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=

         # Pack the outputs of `encode_prompt`.
         adapted_prompt_embeds_kwargs = {
-            k: prompt_embeds_kwargs.pop(k) for k in list(prompt_embeds_kwargs.keys()) if k in pipe_call_parameters
+            name: prompt_embeds_kwargs[name] for name in prompt_embeds_kwargs if name in pipe_call_parameters
         }
 
         # now initialize a pipeline without text encoders and compute outputs with the
         # `encode_prompt()` outputs and other relevant inputs.
-        components_with_text_encoders = {}
-        for k in components:
-            if "text" in k or "tokenizer" in k:
-                components_with_text_encoders[k] = None
-            else:
-                components_with_text_encoders[k] = components[k]
-        pipe_without_text_encoders = self.pipeline_class(**components_with_text_encoders).to(torch_device)
+        components_without_text_encoders = {
+            name: None if _contains_text_key(name) else component for name, component in components.items()
+        }
+        pipe_without_text_encoders = self.pipeline_class(**components_without_text_encoders).to(torch_device)
 
         # Set `negative_prompt` to None as we have already calculated its embeds
         # if it was present in `inputs`. This is because otherwise we will interfere wrongly
@@ -2181,7 +2185,6 @@ def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=
             and pipe_call_parameters.get("prompt_embeds").default is None
         ):
             pipe_without_tes_inputs.update({"prompt": None})
-
         pipe_out = pipe_without_text_encoders(**pipe_without_tes_inputs)[0]
 
         # Compare against regular pipeline outputs.
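
To make the `keep_params` semantics above concrete, here is a small self-contained sketch (with made-up stand-in values; a real run uses the pipeline's dummy inputs) of the input partitioning the test now performs:

```python
# Stand-in values illustrating the logic in test_encode_prompt_works_in_isolation.
inputs = {"prompt": "dance monkey", "image": "<PIL image>", "max_sequence_length": 16, "height": 1024}
encode_prompt_param_names = ["prompt", "image", "max_sequence_length"]
keep_params = ["image", "max_sequence_length"]

# Every input that encode_prompt understands is copied into its own kwargs...
encode_prompt_inputs = {name: inputs[name] for name in encode_prompt_param_names if name in inputs}

# ...but only names NOT in keep_params are removed from the pipeline inputs, so
# keep_params entries are shared by encode_prompt and the later pipe(...) call.
for name in encode_prompt_param_names:
    if name in inputs and (not keep_params or name not in keep_params):
        inputs.pop(name)

assert "prompt" not in inputs  # consumed by encode_prompt only
assert "image" in inputs and "max_sequence_length" in inputs  # shared with pipe(...)
```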