2 changes: 1 addition & 1 deletion MIGRATION_GUIDE_V5.md
@@ -128,7 +128,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained(
- Methods to initialize a nested config, such as `from_xxx_config`, are deleted. Configs can be initialized through the `__init__` method in the same way. See [#41314](https://github.com/huggingface/transformers/pull/41314).
- It is no longer possible to load a config class from a URL file. Configs must be loaded from either a local path or a repo on the Hub. See [#42383](https://github.com/huggingface/transformers/pull/42383).
- All parameters for configuring a model's rotary embedding are now stored under `config.rope_parameters`, including `rope_theta` and `rope_type`. The model's `config.rope_parameters` is a plain dictionary in most cases, and can also be a nested dict in special cases (e.g. Gemma3 and ModernBert) with a different RoPE parameterization for each layer type. See [#39847](https://github.com/huggingface/transformers/pull/39847)

- The Qwen-VL family configuration is nested, and trying to access keys directly (e.g. `config.vocab_size`) will throw an error. Users are expected to access keys through the respective sub-configs (e.g. `config.text_config.vocab_size`), as sketched below.
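
A minimal sketch of the new access pattern (assuming a transformers build that includes this change; the checkpoint name is only an example):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

config.text_config.vocab_size                     # read text attributes through the text sub-config
config.text_config.rope_parameters["rope_theta"]  # RoPE parameters also live on the text sub-config
config.vision_config.patch_size                   # vision attributes live on the vision sub-config
# config.vocab_size                               # raises an AttributeError after this change
```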

## Processing

60 changes: 27 additions & 33 deletions src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
@@ -24,6 +24,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from typing import Optional

from ...configuration_utils import PreTrainedConfig, layer_type_validation
@@ -127,6 +128,12 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig):
Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
with longer `max_position_embeddings`.
bos_token_id (`int`, *optional*, defaults to 151643):
The id of the _beginning-of-stream_ token.
eos_token_id (`int`, *optional*, defaults to 151645):
The id of the _end-of-stream_ token.
pad_token_id (`int`, *optional*):
The id of the _padding_ token.

```python
>>> from transformers import Qwen2_5_VLTextModel, Qwen2_5_VLConfig
@@ -180,6 +187,9 @@ def __init__(
layer_types: Optional[list[str]] = None,
attention_dropout: Optional[float] = 0.0,
rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
bos_token_id: Optional[int] = 151643,
eos_token_id: Optional[int] = 151645,
pad_token_id: Optional[int] = None,
**kwargs,
):
self.vocab_size = vocab_size
@@ -222,7 +232,14 @@ def __init__(
if self.rope_parameters["rope_type"] == "mrope":
self.rope_parameters["rope_type"] = "default"
rope_config_validation(self, ignore_keys={"mrope_section"})
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

super().__init__(
tie_word_embeddings=tie_word_embeddings,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
**kwargs,
)


class Qwen2_5_VLConfig(PreTrainedConfig):
@@ -277,11 +294,6 @@ def __init__(
vision_end_token_id=151653,
**kwargs,
):
# We need to init super() here so that it does not reset values
# that are in text config to the BaseClass defaults. The Base
# config has many text related defaults and not all defaults are same as for `Qwen2_5_VLTextConfig`
super().__init__(**kwargs)

if isinstance(vision_config, dict):
self.vision_config = self.sub_configs["vision_config"](**vision_config)
elif vision_config is None:
@@ -290,39 +302,21 @@
if isinstance(text_config, dict):
self.text_config = self.sub_configs["text_config"](**text_config)
elif text_config is None:
# For BC use all kwargs to init `TextConfig`
self.text_config = self.sub_configs["text_config"](**kwargs)
# Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig`
text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys()
text_params = list(text_params) + ["rope_scaling", "rope_theta"]
text_config = {key: kwargs.pop(key) for key in text_params if key in kwargs}
text_config["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype
Review comment (Member Author) on lines +305 to +309: Not the best way, I should admit. The problem is that the config assigns all kwargs as attributes, and if we don't pop them, we end up with the same set of kwargs in the text config and in the general config.

self.text_config = self.sub_configs["text_config"](**text_config)

self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.vision_start_token_id = vision_start_token_id
self.vision_end_token_id = vision_end_token_id

# Attention implementation to use. It sets it recursively on sub-configs so we call it again in the end
self._attn_implementation = kwargs.pop("attn_implementation", None)

def __setattr__(self, key, value):
if (
(text_config := super().__getattribute__("__dict__").get("text_config")) is not None
and key not in ["_name_or_path", "model_type", "dtype", "_attn_implementation_internal"]
and key in text_config.__dict__
):
setattr(text_config, key, value)
else:
super().__setattr__(key, value)

def __getattribute__(self, key):
if "text_config" in super().__getattribute__("__dict__") and key not in [
"_name_or_path",
"model_type",
"dtype",
"_attn_implementation_internal",
]:
text_config = super().__getattribute__("text_config")
if key in text_config.__dict__:
return getattr(text_config, key)

return super().__getattribute__(key)
# FIXME: arthur/cyril - tying has to be used from the text config
kwargs["tie_word_embeddings"] = self.text_config.tie_word_embeddings
super().__init__(**kwargs)


__all__ = ["Qwen2_5_VLConfig", "Qwen2_5_VLTextConfig"]
60 changes: 27 additions & 33 deletions src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
@@ -14,6 +14,7 @@
# limitations under the License.
"""Qwen2VL model configuration"""

import inspect
from typing import Optional

from ...configuration_utils import PreTrainedConfig, layer_type_validation
@@ -115,6 +116,12 @@ class Qwen2VLTextConfig(PreTrainedConfig):
Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
with longer `max_position_embeddings`.
bos_token_id (`int`, *optional*, defaults to 151643):
The id of the _beginning-of-stream_ token.
eos_token_id (`int`, *optional*, defaults to 151645):
The id of the _end-of-stream_ token.
pad_token_id (`int`, *optional*):
The id of the _padding_ token.

```python
>>> from transformers import Qwen2VLTextModel, Qwen2VLConfig
@@ -168,6 +175,9 @@ def __init__(
layer_types: Optional[list[str]] = None,
attention_dropout: Optional[float] = 0.0,
rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
bos_token_id: Optional[int] = 151643,
eos_token_id: Optional[int] = 151645,
pad_token_id: Optional[int] = None,
**kwargs,
):
self.vocab_size = vocab_size
@@ -210,7 +220,14 @@ def __init__(
if self.rope_parameters["rope_type"] == "mrope":
self.rope_parameters["rope_type"] = "default"
rope_config_validation(self, ignore_keys={"mrope_section"})
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

super().__init__(
tie_word_embeddings=tie_word_embeddings,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
**kwargs,
)


class Qwen2VLConfig(PreTrainedConfig):
@@ -265,11 +282,6 @@ def __init__(
vision_end_token_id=151653,
**kwargs,
):
# We need to init super() here so that it does not reset values
# that are in text config to the BaseClass defaults. The Base
# config has many text related defaults and not all defaults are same as for `Qwen2VLTextConfig`
super().__init__(**kwargs)

if isinstance(vision_config, dict):
self.vision_config = self.sub_configs["vision_config"](**vision_config)
elif vision_config is None:
@@ -278,39 +290,21 @@
if isinstance(text_config, dict):
self.text_config = self.sub_configs["text_config"](**text_config)
elif text_config is None:
# For BC use all kwargs to init `TextConfig`
self.text_config = self.sub_configs["text_config"](**kwargs)
# Hub configs are saved as flat dicts so we pop some of kwargs to init `TextConfig`
text_params = inspect.signature(self.sub_configs["text_config"].__init__).parameters.keys()
text_params = list(text_params) + ["rope_scaling", "rope_theta"]
text_config = {key: kwargs.pop(key) for key in text_params if key in kwargs}
text_config["dtype"] = kwargs.get("torch_dtype", kwargs.get("dtype")) # don't pop the dtype
self.text_config = self.sub_configs["text_config"](**text_config)

self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.vision_start_token_id = vision_start_token_id
self.vision_end_token_id = vision_end_token_id

# Attention implementation to use. It sets it recursively on sub-configs so we call it again in the end
self._attn_implementation = kwargs.pop("attn_implementation", None)

def __setattr__(self, key, value):
if (
(text_config := super().__getattribute__("__dict__").get("text_config")) is not None
and key not in ["_name_or_path", "model_type", "dtype", "_attn_implementation_internal"]
and key in text_config.__dict__
):
setattr(text_config, key, value)
else:
super().__setattr__(key, value)

def __getattribute__(self, key):
if "text_config" in super().__getattribute__("__dict__") and key not in [
"_name_or_path",
"model_type",
"dtype",
"_attn_implementation_internal",
]:
text_config = super().__getattribute__("text_config")
if key in text_config.__dict__:
return getattr(text_config, key)

return super().__getattribute__(key)
# FIXME: arthur/cyril - tying has to be used from the text config
kwargs["tie_word_embeddings"] = self.text_config.tie_word_embeddings
super().__init__(**kwargs)


__all__ = ["Qwen2VLConfig", "Qwen2VLTextConfig"]
38 changes: 0 additions & 38 deletions tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -209,44 +209,6 @@ def setUp(self):
def test_config(self):
self.config_tester.run_common_tests()

def test_text_config(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
base_config_dict = config.to_dict()
base_config = Qwen2_5_VLConfig(**base_config_dict)

# Trying to get or set text related attributes happens via text config
vocab_size = base_config.vocab_size
text_vocab_size = base_config.text_config.vocab_size
self.assertEqual(vocab_size, text_vocab_size)

base_config.vocab_size = 55
self.assertEqual(base_config.vocab_size, 55)
self.assertEqual(base_config.text_config.vocab_size, 55)

# We can still initialize config from old-format json, i.e. flat structure
text_config_dict = base_config_dict.pop("text_config")
flat_config_dict = {**text_config_dict, **base_config_dict}
config_from_flat_dict = Qwen2_5_VLConfig(**flat_config_dict)
config_from_flat_dict.vocab_size = 78
self.assertEqual(config_from_flat_dict.vocab_size, 78)
self.assertEqual(config_from_flat_dict.text_config.vocab_size, 78)

# Vision config attributes are NOT force-set via vision config
base_config.patch_size = 8
self.assertEqual(base_config.patch_size, 8)
self.assertNotEqual(base_config.vision_config.patch_size, 8)

# Test for making sure config save and load preserves correct model type
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

self.assertEqual(config.model_type, "qwen2_5_vl")

with tempfile.TemporaryDirectory() as tmp_dir:
config.save_pretrained(tmp_dir)

loaded_config = Qwen2_5_VLConfig.from_pretrained(tmp_dir)
self.assertEqual(loaded_config.model_type, "qwen2_5_vl")

def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs throw an error with an explicit message saying what is wrong
38 changes: 0 additions & 38 deletions tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -192,44 +192,6 @@ def setUp(self):
def test_config(self):
self.config_tester.run_common_tests()

def test_text_config(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
base_config_dict = config.to_dict()
base_config = Qwen2VLConfig(**base_config_dict)

# Trying to get or set text related attributes happens via text config
vocab_size = base_config.vocab_size
text_vocab_size = base_config.text_config.vocab_size
self.assertEqual(vocab_size, text_vocab_size)

base_config.vocab_size = 55
self.assertEqual(base_config.vocab_size, 55)
self.assertEqual(base_config.text_config.vocab_size, 55)

# We can still initialize config from old-format json, i.e. flat structure
text_config_dict = base_config_dict.pop("text_config")
flat_config_dict = {**text_config_dict, **base_config_dict}
config_from_flat_dict = Qwen2VLConfig(**flat_config_dict)
config_from_flat_dict.vocab_size = 78
self.assertEqual(config_from_flat_dict.vocab_size, 78)
self.assertEqual(config_from_flat_dict.text_config.vocab_size, 78)

# Vision config attributes are NOT force-set via vision config
base_config.patch_size = 8
self.assertEqual(base_config.patch_size, 8)
self.assertNotEqual(base_config.vision_config.patch_size, 8)

# Test for making sure config save and load preserves correct model type
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

self.assertEqual(config.model_type, "qwen2_vl")

with tempfile.TemporaryDirectory() as tmp_dir:
config.save_pretrained(tmp_dir)

loaded_config = Qwen2VLConfig.from_pretrained(tmp_dir)
self.assertEqual(loaded_config.model_type, "qwen2_vl")

def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs throw an error with an explicit message saying what is wrong