diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index f3c9f0e1d58f..6d8ad757b968 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -1694,7 +1694,8 @@ def _deepstack_process(self, hidden_states, visual_pos_masks, visual_embeds): visual_pos_masks = visual_pos_masks[..., 0] visual_pos_masks = visual_pos_masks.to(hidden_states.device) visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype) - local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds + hidden_states = hidden_states.clone() + local_this = hidden_states[visual_pos_masks, :] + visual_embeds hidden_states[visual_pos_masks, :] = local_this return hidden_states @@ -2888,7 +2889,8 @@ def _deepstack_process( ): visual_pos_masks = visual_pos_masks.to(hidden_states.device) visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype) - local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds + hidden_states = hidden_states.clone() + local_this = hidden_states[visual_pos_masks, :] + visual_embeds hidden_states[visual_pos_masks, :] = local_this return hidden_states diff --git a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 545d499772f7..bba6c6cd37e3 100644 --- a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -876,7 +876,8 @@ def _deepstack_process( ): visual_pos_masks = visual_pos_masks.to(hidden_states.device) visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype) - local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds + hidden_states = hidden_states.clone() + local_this = hidden_states[visual_pos_masks, :] + visual_embeds hidden_states[visual_pos_masks, :] = local_this return hidden_states diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 3af6a3396435..9875aca00699 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -746,7 +746,8 @@ def _deepstack_process( ): visual_pos_masks = visual_pos_masks.to(hidden_states.device) visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype) - local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds + hidden_states = hidden_states.clone() + local_this = hidden_states[visual_pos_masks, :] + visual_embeds hidden_states[visual_pos_masks, :] = local_this return hidden_states diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 109eb02d67c4..7bfbdb5dfb4d 100644 --- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -957,7 +957,8 @@ def _deepstack_process( ): visual_pos_masks = visual_pos_masks.to(hidden_states.device) visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype) - local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds + hidden_states = hidden_states.clone() + local_this = hidden_states[visual_pos_masks, :] + visual_embeds hidden_states[visual_pos_masks, :] = local_this return hidden_states