huggingface
diff --git a/src/transformers/exporters/exporter_dynamo.py b/src/transformers/exporters/exporter_dynamo.py
Lines changed: 12 additions & 1 deletion
diff --git a/src/transformers/exporters/utils.py b/src/transformers/exporters/utils.py
Lines changed: 69 additions & 43 deletions
diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/transformers/models/minicpmv4_6/modeling_minicpmv4_6.py b/src/transformers/models/minicpmv4_6/modeling_minicpmv4_6.py
Lines changed: 33 additions & 80 deletions
@@ -154,7 +154,7 @@ def _patch_chunked_vision_attention(module):
     )
     src = inspect.getsource(module.forward) if has_attention else ""
     if has_attention and "zip(*splits)" in src:
-        returns_tuple = "return attn_output, attn_weight" in src
+        returns_tuple = "return attn_output, attn_weight" in src or "return attn_output, None" in src
         return ("forward", functools.partial(_reshaped_vision_attention_forward, module, returns_tuple=returns_tuple))
 
 
@@ -169,6 +169,13 @@ def _reshaped_vision_attention_forward(
 ):
     """Export-safe vision attention: reshape segments into batch dim, single SDPA call."""
 
+    # Normalise NaViT-style `(1, T, D)` packing (minicpmv4_6) to the flat `(T, D)` layout
+    # the rest of this wrapper assumes. The leading dim is always 1 — multi-image batches
+    # are packed along the sequence dim.
+    needs_batch_restore = hidden_states.ndim == 3
+    if needs_batch_restore:
+        hidden_states = hidden_states.squeeze(0)
+
     seq_length = hidden_states.shape[0]
     num_segments = cu_seqlens.shape[0] - 1
     torch_compilable_check(
@@ -232,6 +239,10 @@ def _to_batched(t):
     attn_output = attn_output.transpose(1, 2).reshape(seq_length, -1).contiguous()
     out_proj = self.proj if hasattr(self, "proj") else self.out_proj
     attn_output = out_proj(attn_output)
+
+    if needs_batch_restore:
+        attn_output = attn_output.unsqueeze(0)
+
     return (attn_output, None) if returns_tuple else attn_output
 
 
 
@@ -259,17 +259,19 @@ def prepare_for_export(
     "visual",
 )
 _MULTIMODAL_SUBMODULE_NAMES = _MULTIMODAL_ENCODER_NAMES + _MULTIMODAL_PROJECTOR_NAMES + _MULTIMODAL_LM_NAMES
+_WRAPPER_ATTRS = ("model", "vlm")
 
 
 def _find_multimodal_submodules(model: PreTrainedModel) -> dict[str, torch.nn.Module]:
     """Return `{attr_name: module}` for all known multi-modal submodule names found on the model.
 
-    Checks `model` first, then `model.model` (common wrapper pattern).
+    Checks `model` first, then known wrapper attributes (`model.model`, `model.vlm`, …).
     Only returns results when at least one modal encoder AND one language model are
     found — otherwise the model is not multi-modal and should be exported as a single unit.
     """
+    roots = [model] + [getattr(model, attr, None) for attr in _WRAPPER_ATTRS]
     found: dict[str, torch.nn.Module] = {}
-    for root in (model, getattr(model, "model", None)):
+    for root in roots:
         if root is None:
             continue
         for name in _MULTIMODAL_SUBMODULE_NAMES:
@@ -316,47 +318,69 @@ def _precompute_vision_inputs(model: torch.nn.Module, inputs: dict[str, Any]) ->
             position_ids, _ = model.get_rope_index(**rope_inputs)
             inputs["position_ids"] = position_ids
 
-    # Vision submodule level: precompute from grid_thw
+    modeling_module = sys.modules[type(model).__module__]
+
+    # NaViT-style packed encoders carry per-image `(h, w)` as `target_sizes` instead of `grid_thw`.
+    # Run the nearest-position-id / window-index / merged-shape helpers on the synthesised
+    # `grid_thw = [1, h, w]` so the per-image Python loops move outside the traced graph.
+    target_sizes = inputs.get("target_sizes")
+    if target_sizes is not None:
+        device = target_sizes.device
+        num_patches_per_side = _find_submodule_attr(model, "num_patches_per_side")
+        if hasattr(modeling_module, "get_vision_nearest_position_ids") and num_patches_per_side is not None:
+            inputs["position_ids"] = modeling_module.get_vision_nearest_position_ids(
+                target_sizes, num_patches_per_side
+            ).to(device)
+
+        window_kernel_size = _find_submodule_attr(model, "window_kernel_size")
+        if hasattr(modeling_module, "get_vision_window_index") and window_kernel_size is not None:
+            grid_thw = torch.nn.functional.pad(target_sizes, (1, 0), value=1)
+            window_index, cu_window_seqlens = modeling_module.get_vision_window_index(
+                grid_thw, spatial_merge_size=1, window_size=window_kernel_size[0], patch_size=1
+            )
+            inputs["window_index"] = window_index.to(device)
+            inputs["cu_window_seqlens"] = cu_window_seqlens.to(device)
+            inputs["merged_shape"] = modeling_module.get_vision_merged_shape(target_sizes, window_kernel_size)
+
+    # Vision submodule level: precompute from grid_thw. Vision config attributes can live
+    # anywhere in the submodule tree (encoder, transformer, embeddings, …) — walk to find
+    # them rather than asking models to mirror state on the outer module just so the
+    # exporter can read it.
     grid_thw = inputs.get("grid_thw")
-    if grid_thw is None:
-        return
-
-    model_mod = sys.modules[type(model).__module__]
-
-    if hasattr(model_mod, "get_vision_cu_seqlens"):
-        inputs["cu_seqlens"] = model_mod.get_vision_cu_seqlens(grid_thw)
-
-    # Vision config attributes can live anywhere in the submodule tree (encoder, transformer,
-    # embeddings, …) — walk to find them rather than asking models to mirror state on the
-    # outer module just so the exporter can read it.
-    spatial_merge_size = _find_submodule_attr(model, "spatial_merge_size")
-    if spatial_merge_size is None:
-        # Video-Llama-3 carries per-image merge sizes as an input tensor; PaddleOCR-VL has none
-        # (the encoder hard-codes `1` because spatial merging happens in the projector).
-        spatial_merge_size = inputs.get("merge_sizes", 1)
-
-    if hasattr(model_mod, "get_vision_position_ids"):
-        inputs["position_ids"] = model_mod.get_vision_position_ids(grid_thw, spatial_merge_size)
-
-    window_size = _find_submodule_attr(model, "window_size")
-    patch_size = _find_submodule_attr(model, "patch_size")
-    if hasattr(model_mod, "get_vision_window_index") and window_size is not None and patch_size is not None:
-        inputs["window_index"], inputs["cu_window_seqlens"] = model_mod.get_vision_window_index(
-            grid_thw, spatial_merge_size, window_size, patch_size
-        )
+    if grid_thw is not None:
+        spatial_merge_size = _find_submodule_attr(model, "spatial_merge_size")
+        if spatial_merge_size is None:
+            # Video-Llama-3 carries per-image merge sizes as an input tensor; PaddleOCR-VL has
+            # none (its encoder hard-codes `1` because spatial merging happens in the projector).
+            spatial_merge_size = inputs.get("merge_sizes", 1)
+
+        if hasattr(modeling_module, "get_vision_cu_seqlens"):
+            inputs["cu_seqlens"] = modeling_module.get_vision_cu_seqlens(grid_thw)
+
+        if hasattr(modeling_module, "get_vision_position_ids"):
+            inputs["position_ids"] = modeling_module.get_vision_position_ids(grid_thw, spatial_merge_size)
+
+        window_size = _find_submodule_attr(model, "window_size")
+        patch_size = _find_submodule_attr(model, "patch_size")
+        if hasattr(modeling_module, "get_vision_window_index") and window_size is not None and patch_size is not None:
+            inputs["window_index"], inputs["cu_window_seqlens"] = modeling_module.get_vision_window_index(
+                grid_thw, spatial_merge_size, window_size, patch_size
+            )
 
-    num_grid_per_side = _find_submodule_attr(model, "num_grid_per_side")
-    if hasattr(model_mod, "get_vision_bilinear_indices_and_weights") and num_grid_per_side is not None:
-        inputs["bilinear_indices"], inputs["bilinear_weights"] = model_mod.get_vision_bilinear_indices_and_weights(
-            grid_thw, num_grid_per_side, spatial_merge_size
-        )
+        num_grid_per_side = _find_submodule_attr(model, "num_grid_per_side")
+        if hasattr(modeling_module, "get_vision_bilinear_indices_and_weights") and num_grid_per_side is not None:
+            inputs["bilinear_indices"], inputs["bilinear_weights"] = (
+                modeling_module.get_vision_bilinear_indices_and_weights(
+                    grid_thw, num_grid_per_side, spatial_merge_size
+                )
+            )
 
 
 def _precompute_audio_inputs(model: torch.nn.Module, inputs: dict[str, Any]) -> None:
     """Precompute audio encoder inputs that use untraceable ops (.tolist(), nonzero(), loops)."""
-    model_mod = sys.modules[type(model).__module__]
+    modeling_module = sys.modules[type(model).__module__]
 
-    if not hasattr(model_mod, "chunk_and_pad_features"):
+    if not hasattr(modeling_module, "chunk_and_pad_features"):
         return
 
     if "input_features" not in inputs or "feature_lens" not in inputs:
@@ -365,23 +389,25 @@ def _precompute_audio_inputs(model: torch.nn.Module, inputs: dict[str, Any]) ->
     feature_lens = inputs.pop("feature_lens")
     input_features = inputs.pop("input_features")
 
-    padded_feature, chunk_lengths = model_mod.chunk_and_pad_features(input_features, feature_lens, model.n_window)
+    padded_feature, chunk_lengths = modeling_module.chunk_and_pad_features(
+        input_features, feature_lens, model.n_window
+    )
     inputs["padded_feature"] = padded_feature
     inputs["chunk_lengths"] = chunk_lengths
 
-    if hasattr(model_mod, "get_audio_cu_seqlens"):
-        fn = model_mod.get_audio_cu_seqlens
+    if hasattr(modeling_module, "get_audio_cu_seqlens"):
+        fn = modeling_module.get_audio_cu_seqlens
         fn_params = set(inspect.signature(fn).parameters)
         if "feature_lens" in fn_params:
             inputs["cu_seqlens"] = fn(chunk_lengths, feature_lens, model.n_window_infer, model.n_window)
         else:
             inputs["cu_seqlens"] = fn(chunk_lengths)
 
-    if hasattr(model_mod, "get_valid_indices"):
-        inputs["valid_indices"] = model_mod.get_valid_indices(chunk_lengths)
+    if hasattr(modeling_module, "get_valid_indices"):
+        inputs["valid_indices"] = modeling_module.get_valid_indices(chunk_lengths)
 
-    if hasattr(model_mod, "get_pool_indices"):
-        inputs["pool_indices"] = model_mod.get_pool_indices(feature_lens)
+    if hasattr(modeling_module, "get_pool_indices"):
+        inputs["pool_indices"] = modeling_module.get_pool_indices(feature_lens)
 
 
 @contextlib.contextmanager
 
@@ -916,7 +916,7 @@ def forward(
         molecule_attention_mask = create_bidirectional_mask(
             config=self.config,
             inputs_embeds=init_molecule_encoding[:, 0:1, :],  # force q_len == 1
-            attention_mask=molecule_attention_mask.squeeze(1),  # 3D mask at times due to custom fn
+            attention_mask=molecule_attention_mask,
         )
 
         # Deep BERT encoder
 
@@ -42,6 +42,7 @@
 from ...utils.generic import can_return_tuple, is_flash_attention_requested, merge_with_config_defaults
 from ...utils.import_utils import torch_compilable_check
 from ...utils.output_capturing import capture_outputs
+from ...vision_utils import get_vision_merged_shape, get_vision_nearest_position_ids, get_vision_window_index
 from ..auto import AutoModel
 from .configuration_minicpmv4_6 import MiniCPMV4_6Config, MiniCPMV4_6VisionConfig
 
@@ -80,32 +81,14 @@ def forward(
         self,
         pixel_values: torch.FloatTensor,
         target_sizes: torch.IntTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> torch.Tensor:
         patch_embeds = self.patch_embedding(pixel_values)
         embeddings = patch_embeds.flatten(2).transpose(1, 2)
 
-        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
-
-        position_embeddings = []
-        for target_size in target_sizes:
-            nb_patches_h = target_size[0]
-            nb_patches_w = target_size[1]
-
-            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
-            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
-
-            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
-            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
-
-            pos_ids = (
-                (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w)
-                .flatten()
-                .to(self.position_embedding.weight.device)
-            )
-
-            position_embeddings.append(self.position_embedding(pos_ids))
-
-        position_embeddings = torch.concat(position_embeddings, dim=0).unsqueeze(0)
+        pos_ids = get_vision_nearest_position_ids(target_sizes, self.num_patches_per_side, kwargs=kwargs)
+        pos_ids = pos_ids.to(self.position_embedding.weight.device)
+        position_embeddings = self.position_embedding(pos_ids).unsqueeze(0)
         embeddings = embeddings + position_embeddings
         return embeddings
 
@@ -358,55 +341,27 @@ def _init_weights(self):
         init.normal_(self.linear_2.weight, std=0.25)
         init.normal_(self.linear_2.bias, std=1e-6)
 
-    def get_window_index(self, target_sizes):
+    def get_window_index(self, target_sizes, kwargs=None):
         window_h, window_w = self.window_kernel_size
-        max_seqlens = window_h * window_w
-
-        window_index_list = []
-        cu_seqlens = [0]
-        token_offset = 0
-
-        for height, width in target_sizes:
-            # Cast 0-d device tensors to Python ints so that the whole function
-            # stays CPU-side integer arithmetic. `torch.arange` without `device=`
-            # always returns on CPU; mixing with a device-bound `token_offset`
-            # raises in strict PyTorch versions (2.10+).
-            height, width = int(height), int(width)
-            if height % window_h != 0 or width % window_w != 0:
-                raise ValueError(
-                    f"height={height}, width={width} must be divisible by window size ({window_h}, {window_w})"
-                )
-            index = torch.arange(height * width).reshape(height, width)
-            num_windows_h = height // window_h
-            num_windows_w = width // window_w
-            num_windows = num_windows_h * num_windows_w
-
-            index = index.reshape(num_windows_h, window_h, num_windows_w, window_w)
-            index = index.permute(0, 2, 1, 3).reshape(num_windows, window_h * window_w)
-
-            window_index_list.append(index.reshape(-1) + token_offset)
-
-            cu_this = torch.arange(1, num_windows + 1) * (window_h * window_w) + cu_seqlens[-1]
-            cu_seqlens.extend(cu_this.tolist())
-
-            token_offset += height * width
-
-        window_index = torch.cat(window_index_list)
-        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32)
-
-        return window_index, cu_seqlens, max_seqlens
+        if window_h != window_w:
+            raise ValueError(f"window_kernel_size must be square; got ({window_h}, {window_w})")
+        grid_thw = F.pad(target_sizes, (1, 0), value=1)
+        window_index, cu_seqlens = get_vision_window_index(
+            grid_thw, spatial_merge_size=1, window_size=window_h, patch_size=1, kwargs=kwargs
+        )
+        return window_index, cu_seqlens, window_h * window_w
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         target_sizes: torch.IntTensor,
-        cu_seqlens: torch.Tensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
     ):
         residual = hidden_states
         hidden_states = self.layer_norm1(hidden_states)
         device = hidden_states.device
 
-        window_index, window_cu_seqlens, window_max_seqlens = self.get_window_index(target_sizes)
+        window_index, window_cu_seqlens, window_max_seqlens = self.get_window_index(target_sizes, kwargs=kwargs)
         window_index = window_index.to(device)
 
         hidden_states = hidden_states[:, window_index, :]
@@ -418,28 +373,26 @@ def forward(
         hidden_states = hidden_states[:, torch.argsort(window_index), :]
         hidden_states = residual + hidden_states
 
-        batch_size, _ = target_sizes.shape
+        # Vectorised window merge: reshape (1, batch*seq_per_img, D) → (batch, seq_per_img, D)
+        # and lift per-image (h, w) from target_sizes[0]. This assumes the input batch was
+        # packed with uniform per-image sizes (the standard NaViT preprocessing output).
+        batch_size = target_sizes.shape[0]
         window_h, window_w = self.window_kernel_size
-        all_pixel_values = []
-        for batch_idx in range(batch_size):
-            height, width = target_sizes[batch_idx]
-            patch = hidden_states[0, cu_seqlens[batch_idx] : cu_seqlens[batch_idx + 1], :].squeeze(0)
-
-            embed_dim = patch.shape[-1]
-            merged_h, merged_w = height // window_h, width // window_w
-            patch_5d = patch.view(merged_h, window_h, merged_w, window_w, embed_dim).permute(0, 2, 1, 3, 4)
-            hidden_state = patch_5d.reshape(merged_h * merged_w, window_h * window_w * embed_dim)
-            residual = patch_5d.reshape(merged_h * merged_w, window_h * window_w, embed_dim).mean(dim=1)
+        embed_dim = hidden_states.shape[-1]
+        seq_per_img = hidden_states.shape[1] // batch_size
+        patch = hidden_states.view(batch_size, seq_per_img, embed_dim)
+        merged_h, merged_w = get_vision_merged_shape(target_sizes, self.window_kernel_size, kwargs=kwargs)
 
-            hidden_state = self.pre_norm(hidden_state)
-            hidden_state = self.linear_1(hidden_state)
-            hidden_state = self.act(hidden_state)
-            hidden_state = self.linear_2(hidden_state)
+        patch_5d = patch.view(batch_size, merged_h, window_h, merged_w, window_w, embed_dim).permute(0, 1, 3, 2, 4, 5)
+        flat = patch_5d.reshape(batch_size * merged_h * merged_w, window_h * window_w * embed_dim)
+        residual = patch_5d.reshape(batch_size * merged_h * merged_w, window_h * window_w, embed_dim).mean(dim=1)
 
-            all_pixel_values.append(hidden_state + residual)
+        hidden_state = self.pre_norm(flat)
+        hidden_state = self.linear_1(hidden_state)
+        hidden_state = self.act(hidden_state)
+        hidden_state = self.linear_2(hidden_state)
 
-        new_hidden_states = torch.concat(all_pixel_values, dim=0).unsqueeze(0)
-        return new_hidden_states
+        return (hidden_state + residual).unsqueeze(0)
 
 
 class MiniCPMV4_6VisionPreTrainedModel(PreTrainedModel):
@@ -503,7 +456,7 @@ def forward(
             Whether to apply the ViT window-attention merger after the encoder.
         """
 
-        hidden_states = self.embeddings(pixel_values, target_sizes=target_sizes)
+        hidden_states = self.embeddings(pixel_values, target_sizes=target_sizes, **kwargs)
 
         cu_seqlens = F.pad(
             torch.cumsum(target_sizes[:, 0] * target_sizes[:, 1], dim=0, dtype=torch.int32).to(hidden_states.device),
@@ -523,7 +476,7 @@ def forward(
             for layer_index, encoder_layer in enumerate(self.encoder.layers):
                 hidden_states = encoder_layer(hidden_states, **attn_kwargs)
                 if layer_index == insert_layer_id:
-                    hidden_states = self.vit_merger(hidden_states, target_sizes, cu_seqlens)
+                    hidden_states = self.vit_merger(hidden_states, target_sizes, **kwargs)
 
                     # NOTE: Downsampled hidden states, and therefore other kwargs should also!
                     attn_kwargs, target_sizes, cu_seqlens = self.get_downsampled_inputs(
Original file line number	Diff line number	Diff line change
`@@ -916,7 +916,7 @@ def forward(`
`916`	`916`	`molecule_attention_mask = create_bidirectional_mask(`
`917`	`917`	`config=self.config,`
`918`	`918`	`inputs_embeds=init_molecule_encoding[:, 0:1, :], # force q_len == 1`
`919`		`- attention_mask=molecule_attention_mask.squeeze(1), # 3D mask at times due to custom fn`
	`919`	`+ attention_mask=molecule_attention_mask,`
`920`	`920`	`)`
`921`	`921`
`922`	`922`	`# Deep BERT encoder`