From 16c7020ae81f31796309d7d3512009c8695783b3 Mon Sep 17 00:00:00 2001 From: Tai An Date: Wed, 29 Apr 2026 00:46:45 -0700 Subject: [PATCH 1/2] fix(z_image): pass raw caption length to _pad_with_ids --- src/diffusers/models/transformers/transformer_z_image.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_z_image.py b/src/diffusers/models/transformers/transformer_z_image.py index ba401e7fdef1..75951285ff89 100644 --- a/src/diffusers/models/transformers/transformer_z_image.py +++ b/src/diffusers/models/transformers/transformer_z_image.py @@ -596,7 +596,7 @@ def patchify_and_embed( for image, cap_feat in zip(all_image, all_cap_feats): # Caption cap_out, cap_pos_ids, cap_pad_mask, cap_len, _ = self._pad_with_ids( - cap_feat, (len(cap_feat) + (-len(cap_feat)) % SEQ_MULTI_OF, 1, 1), (1, 0, 0), device + cap_feat, (len(cap_feat), 1, 1), (1, 0, 0), device ) all_cap_out.append(cap_out) all_cap_pos_ids.append(cap_pos_ids) @@ -651,7 +651,7 @@ def patchify_and_embed_omni( noise_val = images_noise_mask[i][j] if j < len(images_noise_mask[i]) else 1 cap_out, cap_pos, cap_mask, cap_len, cap_nm = self._pad_with_ids( cap_item, - (len(cap_item) + (-len(cap_item)) % SEQ_MULTI_OF, 1, 1), + (len(cap_item), 1, 1), (cap_cu_len, 0, 0), device, noise_val, From d9fbc95f9b824765fe98b31c574a0071cdee412d Mon Sep 17 00:00:00 2001 From: Tai An Date: Wed, 29 Apr 2026 00:46:54 -0700 Subject: [PATCH 2/2] fix(z_image controlnet): pass raw caption length to _pad_with_ids --- src/diffusers/models/controlnets/controlnet_z_image.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnets/controlnet_z_image.py b/src/diffusers/models/controlnets/controlnet_z_image.py index 85fa0d365547..bfd3d0c8db87 100644 --- a/src/diffusers/models/controlnets/controlnet_z_image.py +++ b/src/diffusers/models/controlnets/controlnet_z_image.py @@ -597,7 +597,7 @@ def patchify_and_embed( for image, cap_feat in zip(all_image, all_cap_feats): # Caption cap_out, cap_pos_ids, cap_pad_mask, cap_len, _ = self._pad_with_ids( - cap_feat, (len(cap_feat) + (-len(cap_feat)) % SEQ_MULTI_OF, 1, 1), (1, 0, 0), device + cap_feat, (len(cap_feat), 1, 1), (1, 0, 0), device ) all_cap_out.append(cap_out) all_cap_pos_ids.append(cap_pos_ids)