diff --git a/tests/models/transformers/test_models_transformer_hunyuan_1_5.py b/tests/models/transformers/test_models_transformer_hunyuan_1_5.py
new file mode 100644
index 000000000000..021fcdc9cfbf
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_hunyuan_1_5.py
@@ -0,0 +1,100 @@
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import HunyuanVideo15Transformer3DModel
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class HunyuanVideo15Transformer3DTests(ModelTesterMixin, unittest.TestCase):
+    model_class = HunyuanVideo15Transformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    text_embed_dim = 16
+    text_embed_2_dim = 8
+    image_embed_dim = 12
+
+    @property
+    def dummy_input(self):
+        batch_size = 1
+        num_channels = 4
+        num_frames = 1
+        height = 8
+        width = 8
+        sequence_length = 6
+        sequence_length_2 = 4
+        image_sequence_length = 3
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        timestep = torch.tensor([1.0]).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, self.text_embed_dim), device=torch_device)
+        encoder_hidden_states_2 = torch.randn(
+            (batch_size, sequence_length_2, self.text_embed_2_dim), device=torch_device
+        )
+        encoder_attention_mask = torch.ones((batch_size, sequence_length), device=torch_device)
+        encoder_attention_mask_2 = torch.ones((batch_size, sequence_length_2), device=torch_device)
+        # All zeros to induce the T2V path in the model.
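+        # (assumption: non-zero image embeddings would instead exercise the I2V conditioning branch)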
+        image_embeds = torch.zeros((batch_size, image_sequence_length, self.image_embed_dim), device=torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": encoder_attention_mask,
+            "encoder_hidden_states_2": encoder_hidden_states_2,
+            "encoder_attention_mask_2": encoder_attention_mask_2,
+            "image_embeds": image_embeds,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 1, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (4, 1, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 4,
+            "out_channels": 4,
+            "num_attention_heads": 2,
+            "attention_head_dim": 8,
+            "num_layers": 2,
+            "num_refiner_layers": 1,
+            "mlp_ratio": 2.0,
+            "patch_size": 1,
+            "patch_size_t": 1,
+            "text_embed_dim": self.text_embed_dim,
+            "text_embed_2_dim": self.text_embed_2_dim,
+            "image_embed_dim": self.image_embed_dim,
+            "rope_axes_dim": (2, 2, 4),
+            "target_size": 16,
+            "task_type": "t2v",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"HunyuanVideo15Transformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/pipelines/hunyuan_video1_5/__init__.py b/tests/pipelines/hunyuan_video1_5/__init__.py
new file mode 100644
index 000000000000..8fb044d9cf83
--- /dev/null
+++ b/tests/pipelines/hunyuan_video1_5/__init__.py
@@ -0,0 +1 @@
+# Copyright 2025 The HuggingFace Team.
diff --git a/tests/pipelines/hunyuan_video1_5/test_hunyuan_1_5.py b/tests/pipelines/hunyuan_video1_5/test_hunyuan_1_5.py
new file mode 100644
index 000000000000..2d8cc8f257f6
--- /dev/null
+++ b/tests/pipelines/hunyuan_video1_5/test_hunyuan_1_5.py
@@ -0,0 +1,195 @@
+# Copyright 2025 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from transformers import ByT5Tokenizer, Qwen2_5_VLTextConfig, Qwen2_5_VLTextModel, Qwen2Tokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKLHunyuanVideo15,
+    FlowMatchEulerDiscreteScheduler,
+    HunyuanVideo15Pipeline,
+    HunyuanVideo15Transformer3DModel,
+)
+from diffusers.guiders import ClassifierFreeGuidance
+
+from ...testing_utils import enable_full_determinism
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class HunyuanVideo15PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = HunyuanVideo15Pipeline
+    params = frozenset(
+        [
+            "prompt",
+            "negative_prompt",
+            "height",
+            "width",
+            "prompt_embeds",
+            "prompt_embeds_mask",
+            "negative_prompt_embeds",
+            "negative_prompt_embeds_mask",
+            "prompt_embeds_2",
+            "prompt_embeds_mask_2",
+            "negative_prompt_embeds_2",
+            "negative_prompt_embeds_mask_2",
+        ]
+    )
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+        ]
+    )
+    test_attention_slicing = False
+    test_xformers_attention = False
+    test_layerwise_casting = True
+    test_group_offloading = False
+    supports_dduf = False
+
+    def get_dummy_components(self, num_layers: int = 1):
+        torch.manual_seed(0)
+        transformer = HunyuanVideo15Transformer3DModel(
+            in_channels=9,
+            out_channels=4,
+            num_attention_heads=2,
+            attention_head_dim=8,
+            num_layers=num_layers,
+            num_refiner_layers=1,
+            mlp_ratio=2.0,
+            patch_size=1,
+            patch_size_t=1,
+            text_embed_dim=16,
+            text_embed_2_dim=32,
+            image_embed_dim=12,
+            rope_axes_dim=(2, 2, 4),
+            target_size=16,
+            task_type="t2v",
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLHunyuanVideo15(
+            in_channels=3,
+            out_channels=3,
+            latent_channels=4,
+            block_out_channels=(16, 16),
+            layers_per_block=1,
+            spatial_compression_ratio=4,
+            temporal_compression_ratio=2,
+            downsample_match_channel=False,
+            upsample_match_channel=False,
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+
+        torch.manual_seed(0)
+        qwen_config = Qwen2_5_VLTextConfig(
+            **{
+                "hidden_size": 16,
+                "intermediate_size": 16,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 2,
+                "num_key_value_heads": 2,
+                "rope_scaling": {
+                    "mrope_section": [1, 1, 2],
+                    "rope_type": "default",
+                    "type": "default",
+                },
+                "rope_theta": 1000000.0,
+            }
+        )
+        text_encoder = Qwen2_5_VLTextModel(qwen_config)
+        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer_2 = ByT5Tokenizer()
+
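+        # guidance_scale=1.0 keeps classifier-free guidance effectively disabled, so the fast test avoids a second forward pass.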
+        guider = ClassifierFreeGuidance(guidance_scale=1.0)
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder.eval(),
+            "text_encoder_2": text_encoder_2.eval(),
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "guider": guider,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "monkey",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "height": 16,
+            "width": 16,
+            "num_frames": 9,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
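+        # Two denoising steps on tiny dummy components keep this CPU run cheap while still exercising the full forward path.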
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        result = pipe(**inputs)
+        video = result.frames
+
+        generated_video = video[0]
+        self.assertEqual(generated_video.shape, (9, 3, 16, 16))
+        generated_slice = generated_video.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+
+        # fmt: off
+        expected_slice = torch.tensor([0.4296, 0.5549, 0.3088, 0.9115, 0.5049, 0.7926, 0.5549, 0.8618, 0.5091, 0.5075, 0.7117, 0.5292, 0.7053, 0.4864, 0.5206, 0.3878])
+        # fmt: on
+
+        self.assertTrue(
+            torch.abs(generated_slice - expected_slice).max() < 1e-3,
+            f"output_slice: {generated_slice}, expected_slice: {expected_slice}",
+        )
+
+    @unittest.skip("TODO: Test not supported for now because it needs to be adjusted to work with guiders.")
+    def test_encode_prompt_works_in_isolation(self):
+        pass
+
+    @unittest.skip("Needs to be revisited.")
+    def test_inference_batch_consistent(self):
+        super().test_inference_batch_consistent()
+
+    @unittest.skip("Needs to be revisited.")
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical()