Skip to content

Commit 2bb39f4

Browse files
committed
Fix neg embed and divide / bug; Reuse pad zero tensor; Turn cat -> repeat; Add hint for attn processor.
1 parent 5b4c907 commit 2bb39f4

File tree

2 files changed

+41
-33
lines changed

2 files changed

+41
-33
lines changed

src/diffusers/models/transformers/transformer_z_image.py

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ class ZSingleStreamAttnProcessor:
9090
_attention_backend = None
9191
_parallel_config = None
9292

93+
def __init__(self):
94+
if not hasattr(F, "scaled_dot_product_attention"):
95+
raise ImportError(
96+
"ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
97+
)
98+
9399
def __call__(
94100
self,
95101
attn: Attention,
@@ -493,7 +499,6 @@ def patchify_and_embed(
493499

494500
image_ori_len = len(image)
495501
image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
496-
# padded_pos_ids
497502

498503
image_ori_pos_ids = self.create_coordinate_grid(
499504
size=(F_tokens, H_tokens, W_tokens),
@@ -574,11 +579,7 @@ def forward(
574579
x = list(x.split(x_item_seqlens, dim=0))
575580
x_freqs_cis = list(self.rope_embedder(torch.cat(x_pos_ids, dim=0)).split(x_item_seqlens, dim=0))
576581

577-
pad_tensor = torch.zeros(
578-
(1, self.dim),
579-
dtype=x[0].dtype,
580-
device=device,
581-
)
582+
pad_tensor = torch.zeros((1, self.dim), dtype=x[0].dtype, device=device)
582583
freqs_pad_tensor = torch.zeros(
583584
(1, self.dim // self.n_heads // 2),
584585
dtype=x_freqs_cis[0].dtype,
@@ -613,22 +614,19 @@ def forward(
613614
cap_feats = list(cap_feats.split(cap_item_seqlens, dim=0))
614615
cap_freqs_cis = list(self.rope_embedder(torch.cat(cap_pos_ids, dim=0)).split(cap_item_seqlens, dim=0))
615616

616-
pad_tensor = torch.zeros(
617-
(1, self.dim),
618-
dtype=cap_feats[0].dtype,
619-
device=device,
620-
)
621-
freqs_pad_tensor = torch.zeros(
622-
(1, self.dim // self.n_heads // 2),
623-
dtype=cap_freqs_cis[0].dtype,
624-
device=device,
617+
# Reuse padding tensors (convert dtype if needed)
618+
cap_pad_tensor = pad_tensor.to(cap_feats[0].dtype) if pad_tensor.dtype != cap_feats[0].dtype else pad_tensor
619+
cap_freqs_pad_tensor = (
620+
freqs_pad_tensor.to(cap_freqs_cis[0].dtype)
621+
if freqs_pad_tensor.dtype != cap_freqs_cis[0].dtype
622+
else freqs_pad_tensor
625623
)
626624
cap_attn_mask = torch.ones((bsz, cap_max_item_seqlen), dtype=torch.bool, device=device)
627625
for i, (item, freqs_item) in enumerate(zip(cap_feats, cap_freqs_cis)):
628626
seq_len = cap_item_seqlens[i]
629627
pad_len = cap_max_item_seqlen - seq_len
630-
cap_feats[i] = torch.cat([item, pad_tensor.repeat(pad_len, 1)])
631-
cap_freqs_cis[i] = torch.cat([freqs_item, freqs_pad_tensor.repeat(pad_len, 1)])
628+
cap_feats[i] = torch.cat([item, cap_pad_tensor.repeat(pad_len, 1)])
629+
cap_freqs_cis[i] = torch.cat([freqs_item, cap_freqs_pad_tensor.repeat(pad_len, 1)])
632630
cap_attn_mask[i, seq_len:] = 0
633631
cap_feats = torch.stack(cap_feats)
634632
cap_freqs_cis = torch.stack(cap_freqs_cis)
@@ -652,22 +650,18 @@ def forward(
652650
assert unified_item_seqlens == [len(_) for _ in unified]
653651
unified_max_item_seqlen = max(unified_item_seqlens)
654652

655-
pad_tensor = torch.zeros(
656-
(1, self.dim),
657-
dtype=unified[0].dtype,
658-
device=device,
659-
)
660-
freqs_pad_tensor = torch.zeros(
661-
(1, self.dim // self.n_heads // 2),
662-
dtype=unified_freqs_cis[0].dtype,
663-
device=device,
653+
unified_pad_tensor = pad_tensor.to(unified[0].dtype) if pad_tensor.dtype != unified[0].dtype else pad_tensor
654+
unified_freqs_pad_tensor = (
655+
freqs_pad_tensor.to(unified_freqs_cis[0].dtype)
656+
if freqs_pad_tensor.dtype != unified_freqs_cis[0].dtype
657+
else freqs_pad_tensor
664658
)
665659
unified_attn_mask = torch.ones((bsz, unified_max_item_seqlen), dtype=torch.bool, device=device)
666660
for i, (item, freqs_item) in enumerate(zip(unified, unified_freqs_cis)):
667661
seq_len = unified_item_seqlens[i]
668662
pad_len = unified_max_item_seqlen - seq_len
669-
unified[i] = torch.cat([item, pad_tensor.repeat(pad_len, 1)])
670-
unified_freqs_cis[i] = torch.cat([freqs_item, freqs_pad_tensor.repeat(pad_len, 1)])
663+
unified[i] = torch.cat([item, unified_pad_tensor.repeat(pad_len, 1)])
664+
unified_freqs_cis[i] = torch.cat([freqs_item, unified_freqs_pad_tensor.repeat(pad_len, 1)])
671665
unified_attn_mask[i, seq_len:] = 0
672666
unified = torch.stack(unified)
673667
unified_freqs_cis = torch.stack(unified_freqs_cis)

src/diffusers/pipelines/z_image/pipeline_z_image.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ def encode_prompt(
193193
prompt_embeds=negative_prompt_embeds,
194194
max_sequence_length=max_sequence_length,
195195
)
196+
else:
197+
negative_prompt_embeds = []
196198
return prompt_embeds, negative_prompt_embeds
197199

198200
def _encode_prompt(
@@ -398,6 +400,18 @@ def __call__(
398400
height = height or 1024
399401
width = width or 1024
400402

403+
vae_scale = self.vae_scale_factor * 2
404+
if height % vae_scale != 0:
405+
raise ValueError(
406+
f"Height must be divisible by {vae_scale} (got {height}). "
407+
f"Please adjust the height to a multiple of {vae_scale}."
408+
)
409+
if width % vae_scale != 0:
410+
raise ValueError(
411+
f"Width must be divisible by {vae_scale} (got {width}). "
412+
f"Please adjust the width to a multiple of {vae_scale}."
413+
)
414+
401415
assert self.dtype == torch.bfloat16
402416
dtype = self.dtype
403417
device = self._execution_device
@@ -447,7 +461,7 @@ def __call__(
447461
generator,
448462
latents,
449463
)
450-
image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] / 2)
464+
image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
451465

452466
# 5. Prepare timesteps
453467
mu = calculate_shift(
@@ -495,12 +509,12 @@ def __call__(
495509
apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
496510

497511
if apply_cfg:
498-
# Prepare inputs for CFG
499-
latent_model_input = torch.cat([latents.to(dtype)] * 2)
512+
latents_typed = latents if latents.dtype == dtype else latents.to(dtype)
513+
latent_model_input = latents_typed.repeat(2, 1, 1, 1)
500514
prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
501-
timestep_model_input = torch.cat([timestep] * 2)
515+
timestep_model_input = timestep.repeat(2)
502516
else:
503-
latent_model_input = latents.to(dtype)
517+
latent_model_input = latents if latents.dtype == dtype else latents.to(dtype)
504518
prompt_embeds_model_input = prompt_embeds
505519
timestep_model_input = timestep
506520

0 commit comments

Comments
 (0)