From 3ba827bac157494be3342e417fe3f4715c5641d6 Mon Sep 17 00:00:00 2001
From: Stefanos Ginargiros
Date: Wed, 24 Dec 2025 13:57:06 +0000
Subject: [PATCH 1/7] qwen3-vl-processor-videos-arg-correction

---
 src/transformers/models/qwen3_vl/modular_qwen3_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index 1fb2a6993b10..d47a73ff0309 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -1418,7 +1418,7 @@ def __call__(
 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
 videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
- The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
 tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
 return_tensors (`str` or [`~utils.TensorType`], *optional*):
 If set, will return tensors of a particular framework. Acceptable values are:

From cbe9025d3f8c61f633394ee95a3e5974413ae060 Mon Sep 17 00:00:00 2001
From: Stefanos Ginargiros
Date: Wed, 24 Dec 2025 13:58:14 +0000
Subject: [PATCH 2/7] minor sequence typo in processor __call__()

---
 src/transformers/models/qwen3_vl/modular_qwen3_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index d47a73ff0309..df1ea6db0ebf 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -1404,7 +1404,7 @@ def __call__(
 **kwargs: Unpack[Qwen3VLProcessorKwargs],
 ) -> BatchFeature:
 """
- Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
 Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

From de659fb5eafbc358ba41986082ff646816242be1 Mon Sep 17 00:00:00 2001
From: Stefanos Ginargiros
Date: Wed, 24 Dec 2025 13:59:59 +0000
Subject: [PATCH 3/7] minor kwarg typo in processor __call__()

---
 src/transformers/models/qwen3_vl/modular_qwen3_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index df1ea6db0ebf..3663a8108981 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -1406,7 +1406,7 @@ def __call__(
 """
 Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
- the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
 Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

 Args:

From d8507a280c0674ed850340deab8fd67936c3b27d Mon Sep 17 00:00:00 2001
From: Stefanos Ginargiros
Date: Wed, 24 Dec 2025 14:04:03 +0000
Subject: [PATCH 4/7] attention bias duplicate default hinting

---
 src/transformers/models/qwen3_vl/modular_qwen3_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index 3663a8108981..50d8262c7196 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -152,7 +152,7 @@ class Qwen3VLTextConfig(PreTrainedConfig):
 Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
 a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with
 longer `max_position_embeddings`.
- attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+ attention_bias (`bool`, *optional*, defaults to `False`):
 Whether to use a bias in the query, key, value and output projection layers during self-attention.
 attention_dropout (`float`, *optional*, defaults to 0.0):
 The dropout ratio for the attention probabilities.

From 7ee60768569e29f55b94ed610d44d454473e86f6 Mon Sep 17 00:00:00 2001
From: Stefanos Ginargiros
Date: Wed, 24 Dec 2025 16:20:07 +0200
Subject: [PATCH 5/7] image, video, vision token ids - clarification

---
 src/transformers/models/qwen3_vl/modular_qwen3_vl.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index 50d8262c7196..31bbf99ef29a 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -239,13 +239,13 @@ class Qwen3VLConfig(PreTrainedConfig):
 vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
 The config object or dictionary of the vision backbone.
 image_token_id (`int`, *optional*, defaults to 151655):
- The image token index to encode the image prompt.
+ The token id used as the placeholder for image inputs.
 video_token_id (`int`, *optional*, defaults to 151656):
- The video token index to encode the image prompt.
+ The token id used as the placeholder for video inputs.
 vision_start_token_id (`int`, *optional*, defaults to 151652):
- The start token index to encode the image prompt.
+ The token id that marks the start of a vision segment (image or video).
 vision_end_token_id (`int`, *optional*, defaults to 151653):
- The end token index to encode the image prompt.
+ The token id that marks the end of a vision segment (image or video).
 tie_word_embeddings (`bool`, *optional*, defaults to `False`):
 Whether to tie the word embeddings.

From 954f1bea2de5c046e1d487d48a3b5db82d8a5f14 Mon Sep 17 00:00:00 2001
From: Stefanos Ginargiros
Date: Wed, 24 Dec 2025 17:11:22 +0200
Subject: [PATCH 6/7] qwen3vl configuration file update

---
 .../models/qwen3_vl/configuration_qwen3_vl.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py
index cf6f17364672..47b2a8245e01 100644
--- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py
@@ -110,7 +110,7 @@ class Qwen3VLTextConfig(PreTrainedConfig):
 Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
 a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with
 longer `max_position_embeddings`.
- attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+ attention_bias (`bool`, *optional*, defaults to `False`):
 Whether to use a bias in the query, key, value and output projection layers during self-attention.
 attention_dropout (`float`, *optional*, defaults to 0.0):
 The dropout ratio for the attention probabilities.
@@ -197,13 +197,13 @@ class Qwen3VLConfig(PreTrainedConfig):
 vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
 The config object or dictionary of the vision backbone.
 image_token_id (`int`, *optional*, defaults to 151655):
- The image token index to encode the image prompt.
+ The token id used as the placeholder for image inputs.
 video_token_id (`int`, *optional*, defaults to 151656):
- The video token index to encode the image prompt.
+ The token id used as the placeholder for video inputs.
 vision_start_token_id (`int`, *optional*, defaults to 151652):
- The start token index to encode the image prompt.
+ The token id that marks the start of a vision segment (image or video).
 vision_end_token_id (`int`, *optional*, defaults to 151653):
- The end token index to encode the image prompt.
+ The token id that marks the end of a vision segment (image or video).
 tie_word_embeddings (`bool`, *optional*, defaults to `False`):
 Whether to tie the word embeddings.

From 90c5c93618c47ea43a9a3b8e6f6aa77b5bd1412c Mon Sep 17 00:00:00 2001
From: Stefanos Ginargiros
Date: Wed, 24 Dec 2025 17:14:02 +0200
Subject: [PATCH 7/7] qwen3vl processing file update

---
 src/transformers/models/qwen3_vl/processing_qwen3_vl.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
index 5137d916d810..14fdab3e8008 100644
--- a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
@@ -99,9 +99,9 @@ def __call__(
 **kwargs: Unpack[Qwen3VLProcessorKwargs],
 ) -> BatchFeature:
 """
- Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
 and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
- the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
 Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

 Args:
@@ -113,7 +113,7 @@ def __call__(
 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
 videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
- The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
 tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
 return_tensors (`str` or [`~utils.TensorType`], *optional*):
 If set, will return tensors of a particular framework. Acceptable values are:
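
Note for reviewers: below is a minimal sketch of the processor `__call__` usage these docstrings describe. The checkpoint id and the `<|vision_start|><|video_pad|><|vision_end|>` placeholder string are illustrative assumptions (they follow the Qwen2-VL convention); only the `text`, `videos`, and `return_tensors` arguments come from the docstrings themselves.

# Sketch only: the checkpoint id is hypothetical and the placeholder tokens are assumed
# to follow the Qwen-VL convention; the keyword arguments match the docstring above.
import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-4B-Instruct")  # hypothetical checkpoint id

# One video as a 4D array: (frames, height, width, channels); channels-first is also accepted.
video = np.random.randint(0, 256, size=(8, 224, 224, 3), dtype=np.uint8)

# The video placeholder is expanded by the processor into video_token_id positions,
# wrapped by vision_start_token_id / vision_end_token_id.
text = "<|vision_start|><|video_pad|><|vision_end|>Describe what happens in the clip."

inputs = processor(text=[text], videos=[video], return_tensors="pt")
print(sorted(inputs.keys()))  # expect input_ids, attention_mask, and the video features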