20 changes: 13 additions & 7 deletions tests/pipelines/qwenimage/test_qwenimage_edit.py
@@ -15,7 +15,6 @@
 import unittest
 
 import numpy as np
-import pytest
 import torch
 from PIL import Image
 from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
@@ -134,15 +133,17 @@ def get_dummy_inputs(self, device, seed=0):
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
+        # Even if we specify smaller dimensions for the images, it won't take effect
+        # because the internal implementation enforces a minimum resolution of 1024x1024.
         inputs = {
             "prompt": "dance monkey",
-            "image": Image.new("RGB", (32, 32)),
+            "image": Image.new("RGB", (1024, 1024)),
Collaborator:

As we are making the dummy image input much bigger, are the QwenImageEditPipelineFastTests too heavy for the CI? Running

pytest tests/pipelines/qwenimage/test_qwenimage_edit.py::QwenImageEditPipelineFastTests

takes ~4 minutes on an A100 on DGX.
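
If runtime becomes a blocker, stock pytest can report which individual tests dominate (nothing beyond the built-in durations flag is assumed here):

pytest tests/pipelines/qwenimage/test_qwenimage_edit.py::QwenImageEditPipelineFastTests --durations=10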

Member Author:

Observe how txt changes depending on whether `encode_prompt()` is called in isolation or through `pipe(...)`. First, `encode_prompt()` in isolation:

txt=["<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>dance monkey<|im_end|>\n<|im_start|>assistant\n"], image.size=(32, 32)

Second, through `pipe(...)`:

txt=["<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>dance monkey<|im_end|>\n<|im_start|>assistant\n"], image.size=(1024, 1024)

Because `image.size` differs between the two paths, the resulting `prompt_embeds` differ as well, and that causes the assertion to fail. Fixing the image size to (1024, 1024) keeps the prompt embeddings identical in both cases.

But I am also concerned about the runtime increase from raising the image resolution like this. Maybe we just keep the PR open for now unless the community requests it for this model?
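
A minimal sketch of the mismatch, assuming an already-constructed `pipe` (the `encode_prompt` kwargs mirror the test inputs above and are assumptions, not a documented API):

```python
from PIL import Image

image = Image.new("RGB", (32, 32))

# Called in isolation, encode_prompt sees the raw 32x32 image, so the VL
# processor builds the chat template against image.size == (32, 32).
isolated_embeds = pipe.encode_prompt(prompt="dance monkey", image=image)

# Called through pipe(...), the image is first resized up to the enforced
# minimum resolution, so encode_prompt effectively sees a 1024x1024 image
# and produces different prompt_embeds for the same text.
output = pipe(prompt="dance monkey", image=image, num_inference_steps=2)
```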

"negative_prompt": "bad quality",
"generator": generator,
"num_inference_steps": 2,
"true_cfg_scale": 1.0,
"height": 32,
"width": 32,
"height": 1024,
"width": 1024,
"max_sequence_length": 16,
"output_type": "pt",
}
@@ -238,6 +239,11 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True)
-    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
-        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol)
+    def test_encode_prompt_works_in_isolation(
+        self, extra_required_param_value_dict=None, keep_params=None, atol=1e-4, rtol=1e-4
+    ):
+        # We include `image` because it is needed both in `encode_prompt` and in some
+        # other subsequent calculations. We include `max_sequence_length` so its value
+        # stays the same across all invocations of `encode_prompt` in this test.
+        keep_params = ["image", "max_sequence_length"]
+        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, keep_params, atol, rtol)
19 changes: 13 additions & 6 deletions tests/pipelines/qwenimage/test_qwenimage_edit_plus.py
@@ -134,16 +134,18 @@ def get_dummy_inputs(self, device, seed=0):
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
 
-        image = Image.new("RGB", (32, 32))
+        # Even if we specify smaller dimensions for the images, it won't take effect
+        # because the internal implementation enforces a minimum resolution of 384x384.
+        image = Image.new("RGB", (384, 384))
Collaborator:

Similar comment to #12403 (comment), but may not be as big of an issue since the dummy image is smaller here.

         inputs = {
             "prompt": "dance monkey",
             "image": [image, image],
             "negative_prompt": "bad quality",
             "generator": generator,
             "num_inference_steps": 2,
             "true_cfg_scale": 1.0,
-            "height": 32,
-            "width": 32,
+            "height": 384,
+            "width": 384,
             "max_sequence_length": 16,
             "output_type": "pt",
         }
@@ -236,9 +238,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             "VAE tiling should not affect the inference results",
         )
 
-    @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True)
-    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
-        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol)
+    def test_encode_prompt_works_in_isolation(
+        self, extra_required_param_value_dict=None, keep_params=None, atol=1e-4, rtol=1e-4
+    ):
+        # We include `image` because it is needed both in `encode_prompt` and in some
+        # other subsequent calculations. We include `max_sequence_length` so its value
+        # stays the same across all invocations of `encode_prompt` in this test.
+        keep_params = ["image", "max_sequence_length"]
+        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, keep_params, atol, rtol)

     @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
     def test_num_images_per_prompt():
53 changes: 28 additions & 25 deletions tests/pipelines/test_pipelines_common.py
@@ -5,7 +5,7 @@
 import tempfile
 import unittest
 import uuid
-from typing import Any, Callable, Dict, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import numpy as np
 import PIL.Image
@@ -2082,20 +2082,26 @@ def test_loading_with_incorrect_variants_raises_error(self):

assert f"You are trying to load the model files of the `variant={variant}`" in str(error.exception)

def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
def test_encode_prompt_works_in_isolation(
self,
extra_required_param_value_dict: Optional[Dict] = None,
keep_params: Optional[List] = None,
atol=1e-4,
rtol=1e-4,
):
if not hasattr(self.pipeline_class, "encode_prompt"):
return

components = self.get_dummy_components()

def _contains_text_key(name):
return any(token in name for token in ("text", "tokenizer", "processor"))

# We initialize the pipeline with only text encoders and tokenizers,
# mimicking a real-world scenario.
components_with_text_encoders = {}
for k in components:
if "text" in k or "tokenizer" in k:
components_with_text_encoders[k] = components[k]
else:
components_with_text_encoders[k] = None
components_with_text_encoders = {
name: component if _contains_text_key(name) else None for name, component in components.items()
}
pipe_with_just_text_encoder = self.pipeline_class(**components_with_text_encoders)
pipe_with_just_text_encoder = pipe_with_just_text_encoder.to(torch_device)

@@ -2105,17 +2111,18 @@ def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=
         encode_prompt_parameters = list(encode_prompt_signature.parameters.values())
 
         # Required args in encode_prompt with those with no default.
-        required_params = []
-        for param in encode_prompt_parameters:
-            if param.name == "self" or param.name == "kwargs":
-                continue
-            if param.default is inspect.Parameter.empty:
-                required_params.append(param.name)
+        required_params = [
+            param.name
+            for param in encode_prompt_parameters
+            if param.name not in {"self", "kwargs"} and param.default is inspect.Parameter.empty
+        ]
 
         # Craft inputs for the `encode_prompt()` method to run in isolation.
         encode_prompt_param_names = [p.name for p in encode_prompt_parameters if p.name != "self"]
-        input_keys = list(inputs.keys())
-        encode_prompt_inputs = {k: inputs.pop(k) for k in input_keys if k in encode_prompt_param_names}
+        encode_prompt_inputs = {name: inputs[name] for name in encode_prompt_param_names if name in inputs}
+        for name in encode_prompt_param_names:
+            if name in inputs and (not keep_params or name not in keep_params):
+                inputs.pop(name)
 
         pipe_call_signature = inspect.signature(pipe_with_just_text_encoder.__call__)
         pipe_call_parameters = pipe_call_signature.parameters
@@ -2150,18 +2157,15 @@ def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=

         # Pack the outputs of `encode_prompt`.
         adapted_prompt_embeds_kwargs = {
-            k: prompt_embeds_kwargs.pop(k) for k in list(prompt_embeds_kwargs.keys()) if k in pipe_call_parameters
+            name: prompt_embeds_kwargs[name] for name in prompt_embeds_kwargs if name in pipe_call_parameters
         }
 
         # now initialize a pipeline without text encoders and compute outputs with the
         # `encode_prompt()` outputs and other relevant inputs.
-        components_with_text_encoders = {}
-        for k in components:
-            if "text" in k or "tokenizer" in k:
-                components_with_text_encoders[k] = None
-            else:
-                components_with_text_encoders[k] = components[k]
-        pipe_without_text_encoders = self.pipeline_class(**components_with_text_encoders).to(torch_device)
+        components_without_text_encoders = {
+            name: None if _contains_text_key(name) else component for name, component in components.items()
+        }
+        pipe_without_text_encoders = self.pipeline_class(**components_without_text_encoders).to(torch_device)
 
         # Set `negative_prompt` to None as we have already calculated its embeds
         # if it was present in `inputs`. This is because otherwise we will interfere wrongly
@@ -2181,7 +2185,6 @@ def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=
             and pipe_call_parameters.get("prompt_embeds").default is None
         ):
             pipe_without_tes_inputs.update({"prompt": None})
-
         pipe_out = pipe_without_text_encoders(**pipe_without_tes_inputs)[0]
 
         # Compare against regular pipeline outputs.
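
To make the `keep_params` semantics above concrete, here is a small self-contained sketch (with made-up stand-in values; a real run uses the pipeline's dummy inputs) of the input partitioning the test now performs:

```python
# Stand-in values illustrating the logic in test_encode_prompt_works_in_isolation.
inputs = {"prompt": "dance monkey", "image": "<PIL image>", "max_sequence_length": 16, "height": 1024}
encode_prompt_param_names = ["prompt", "image", "max_sequence_length"]
keep_params = ["image", "max_sequence_length"]

# Every input that encode_prompt understands is copied into its own kwargs...
encode_prompt_inputs = {name: inputs[name] for name in encode_prompt_param_names if name in inputs}

# ...but only names NOT in keep_params are removed from the pipeline inputs, so
# keep_params entries are shared by encode_prompt and the later pipe(...) call.
for name in encode_prompt_param_names:
    if name in inputs and (not keep_params or name not in keep_params):
        inputs.pop(name)

assert "prompt" not in inputs  # consumed by encode_prompt only
assert "image" in inputs and "max_sequence_length" in inputs  # shared with pipe(...)
```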