Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion MIGRATION_GUIDE_V5.md
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained(

- Methods to init a nested config such as `from_xxx_config` are deleted. Configs can be init from the `__init__` method in the same way. See [#41314](https://github.com/huggingface/transformers/pull/41314).
- It is no longer possible to load a config class from a URL file. Configs must be loaded from either a local path or a repo on the Hub. See [#42383](https://github.com/huggingface/transformers/pull/42383).
- All parameters for configuring a model's rotary embedding are now stored under `config.rope_parameters`, including the `rope_theta` and `rope_type`. Model's `config.rope_parameters` is a simple dictionary in most cases, and can also be a nested dict in special cases (i.e. Gemma3 and ModernBert) with different rope parameterization for each layer type. See [#39847](https://github.com/huggingface/transformers/pull/39847).
- All parameters for configuring a model's rotary embedding are now stored under `config.rope_parameters`, including the `rope_theta` and `rope_type`. Model's `config.rope_parameters` is a simple dictionary in most cases, and can also be a nested dict in special cases (i.e. Gemma3 and ModernBert) with different rope parameterization for each layer type. Trying to access `config.rope_theta` will raise an `AttributeError` from now on. See [#39847](https://github.com/huggingface/transformers/pull/39847) and [#42255](https://github.com/huggingface/transformers/pull/42255).
- Qwen-VL family configuration is in a nested format and trying to access keys directly will throw an error (e.g. `config.vocab_size`). Users are expected to access keys from their respective sub-configs (`config.text_config.vocab_size`).

## Processing
Expand Down
10 changes: 9 additions & 1 deletion src/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from . import __version__
from .dynamic_module_utils import custom_object_save
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .modeling_rope_utils import RotaryEmbeddingConfigMixin
from .utils import (
CONFIG_NAME,
PushToHubMixin,
Expand All @@ -49,7 +50,7 @@
SpecificPreTrainedConfigType = TypeVar("SpecificPreTrainedConfigType", bound="PreTrainedConfig")


class PreTrainedConfig(PushToHubMixin):
class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
# no-format
r"""
Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as
Expand Down Expand Up @@ -261,6 +262,13 @@ def __init__(

dtype = getattr(torch, dtype)

# BC for rotary embeddings. We will pop out legacy keys from kwargs and rename to new format
if hasattr(self, "rope_parameters"):
ignore_keys_at_rope_validation = kwargs.pop("ignore_keys_at_rope_validation", None)
kwargs = self.convert_rope_params_to_dict(
ignore_keys_at_rope_validation=ignore_keys_at_rope_validation, **kwargs
)

# Attributes common for all models
self.return_dict = return_dict
self.output_hidden_states = output_hidden_states
Expand Down
654 changes: 316 additions & 338 deletions src/transformers/modeling_rope_utils.py

Large diffs are not rendered by default.

12 changes: 3 additions & 9 deletions src/transformers/models/apertus/configuration_apertus.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from typing import Optional

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
from ...modeling_rope_utils import RopeParameters


class ApertusConfig(PreTrainedConfig):
Expand Down Expand Up @@ -99,6 +99,7 @@ class ApertusConfig(PreTrainedConfig):

model_type = "apertus"
keys_to_ignore_at_inference = ["past_key_values"]
default_theta = 12000000.0
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
Expand Down Expand Up @@ -160,14 +161,7 @@ def __init__(
self.use_cache = use_cache
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
self.rope_parameters = rope_scaling or rope_parameters

# Validate the correctness of rotary position embeddings parameters
rope_theta = kwargs.get("rope_theta", 12000000.0)
standardize_rope_params(self, rope_theta=rope_theta)
rope_config_validation(self)
self.rope_parameters = rope_parameters

super().__init__(
pad_token_id=pad_token_id,
Expand Down
55 changes: 30 additions & 25 deletions src/transformers/models/apertus/modular_apertus.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
from torch import nn

from ...cache_utils import Cache
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ..llama.configuration_llama import LlamaConfig
from ..llama.modeling_llama import (
LlamaAttention,
LlamaDecoderLayer,
Expand All @@ -43,7 +43,7 @@
logger = logging.get_logger(__name__)


class ApertusConfig(LlamaConfig):
class ApertusConfig(PreTrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
Expand Down Expand Up @@ -116,6 +116,8 @@ class ApertusConfig(LlamaConfig):
```"""

model_type = "apertus"
keys_to_ignore_at_inference = ["past_key_values"]
default_theta = 12000000.0
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
"layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k
Expand All @@ -124,6 +126,11 @@ class ApertusConfig(LlamaConfig):
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}

def __init__(
self,
Expand Down Expand Up @@ -154,35 +161,33 @@ def __init__(
attention_dropout: Optional[float] = 0.0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads

# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads

self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.rope_parameters = rope_parameters

super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
rms_norm_eps=rms_norm_eps,
use_cache=use_cache,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
rope_parameters=rope_parameters,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
**kwargs,
)
del self.pretraining_tp
del self.mlp_bias
del self.head_dim

# Validate the correctness of rotary position embeddings parameters
rope_theta = kwargs.get("rope_theta", 12000000.0)
standardize_rope_params(self, rope_theta=rope_theta)
rope_config_validation(self)


class ApertusMLP(NemotronMLP):
Expand Down
11 changes: 2 additions & 9 deletions src/transformers/models/arcee/configuration_arcee.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from typing import Optional

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
from ...modeling_rope_utils import RopeParameters


class ArceeConfig(PreTrainedConfig):
Expand Down Expand Up @@ -163,14 +163,7 @@ def __init__(
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
self.rope_parameters = rope_scaling or rope_parameters

# Validate the correctness of rotary position embeddings parameters
rope_theta = kwargs.get("rope_theta", 10000.0)
standardize_rope_params(self, rope_theta=rope_theta)
rope_config_validation(self)
self.rope_parameters = rope_parameters

super().__init__(
pad_token_id=pad_token_id,
Expand Down
11 changes: 2 additions & 9 deletions src/transformers/models/aria/configuration_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from typing import Optional

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
from ...modeling_rope_utils import RopeParameters
from ..auto import CONFIG_MAPPING, AutoConfig


Expand Down Expand Up @@ -168,14 +168,7 @@ def __init__(
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
self.rope_parameters = rope_scaling or rope_parameters

# Validate the correctness of rotary position embeddings parameters
rope_theta = kwargs.get("rope_theta", 10000.0)
standardize_rope_params(self, rope_theta=rope_theta)
rope_config_validation(self)
self.rope_parameters = rope_parameters

super().__init__(
pad_token_id=pad_token_id,
Expand Down
14 changes: 3 additions & 11 deletions src/transformers/models/bamba/configuration_bamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from typing import Optional

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
from ...modeling_rope_utils import RopeParameters
from ...utils import logging


Expand Down Expand Up @@ -171,16 +171,6 @@ def __init__(
self.num_logits_to_keep = num_logits_to_keep

self.attn_layer_indices = attn_layer_indices
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
self.partial_rotary_factor = 0.5
rope_scaling = kwargs.pop("rope_scaling", None)
self.rope_parameters = rope_scaling or rope_parameters

# Validate the correctness of rotary position embeddings parameters
rope_theta = kwargs.get("rope_theta", 10000.0)
standardize_rope_params(self, rope_theta=rope_theta)
rope_config_validation(self)

mamba_intermediate = mamba_expand * hidden_size

if mamba_intermediate % mamba_n_heads != 0:
Expand All @@ -203,6 +193,8 @@ def __init__(
self.mamba_conv_bias = mamba_conv_bias
self.mamba_proj_bias = mamba_proj_bias
self.z_loss_coefficient = z_loss_coefficient
self.rope_parameters = rope_parameters
kwargs["partial_rotary_factor"] = 0.5 # hardcode for BC

super().__init__(
pad_token_id=pad_token_id,
Expand Down
12 changes: 3 additions & 9 deletions src/transformers/models/bitnet/configuration_bitnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from typing import Optional

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
from ...modeling_rope_utils import RopeParameters
from ...utils import logging


Expand Down Expand Up @@ -97,6 +97,7 @@ class BitNetConfig(PreTrainedConfig):

model_type = "bitnet"
keys_to_ignore_at_inference = ["past_key_values"]
default_theta = 500000.0

def __init__(
self,
Expand Down Expand Up @@ -138,14 +139,7 @@ def __init__(
self.use_cache = use_cache
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
self.rope_parameters = rope_scaling or rope_parameters

# Validate the correctness of rotary position embeddings parameters
rope_theta = kwargs.get("rope_theta", 500000.0)
standardize_rope_params(self, rope_theta=rope_theta)
rope_config_validation(self)
self.rope_parameters = rope_parameters

super().__init__(
pad_token_id=pad_token_id,
Expand Down
Loading
Loading