tie_word_embeddings not saved on customized model #38160

@Tavish9

Description

System Info

transformers_version: 4.52.0.dev0
python: 3.10

Who can help?

@ArthurZucker @SunMarc @qubvel @zucchini-nlp

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel, Trainer, TrainingArguments


class MyConfig(PretrainedConfig):
    model_type = "test"
    sub_configs = {"text_config": AutoConfig}

    def __init__(
        self,
        text_config=None,
        **kwargs,
    ):
        self.text_config = text_config

        super().__init__(**kwargs)

class MyModel(PreTrainedModel):
    config_class = MyConfig

    def __init__(self, config, language_model=None) -> None:
        super().__init__(config)
        self.language_model = language_model

        self.post_init()

text_config = AutoConfig.from_pretrained("Qwen/Qwen3-8B")
config = MyConfig(text_config=text_config)
language_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B")
model = MyModel(config, language_model)

trainer = Trainer(
    model=model,
    args=TrainingArguments(),
)

trainer.save_model()
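
If it helps triage, here is a hedged guess at where to look (an assumption on my part, not a verified diagnosis): Trainer.save_model ends up calling model.save_pretrained, which writes config.json through PretrainedConfig.save_pretrained, i.e. to_json_string(use_diff=True) / to_diff_dict(). The key can be checked for before anything is written to disk:

# Hedged diagnostic sketch: if the key is already absent from to_diff_dict(),
# it is being dropped at config-serialization time rather than by Trainer itself.
diff = model.config.to_diff_dict()
print("tie_word_embeddings" in diff.get("text_config", {}))  # False would match the saved file shown below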

Expected behavior

The saved config.json is missing the tie_word_embeddings key inside text_config (a minimal check of the saved file is sketched after the two config dumps below).

Expected:

{
  "architectures": [
    "MyModel"
  ],
  "model_type": "test",
  "text_config": {
    "_name_or_path": "Qwen/Qwen3-8B",
    "architectures": [
      "Qwen3ForCausalLM"
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "eos_token_id": 151645,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 1024,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "max_position_embeddings": 40960,
    "max_window_layers": 28,
    "model_type": "qwen3",
    "num_attention_heads": 16,
    "num_hidden_layers": 28,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "rope_theta": 1000000,
    "sliding_window": null,
    "tie_word_embeddings": false,
    "torch_dtype": "bfloat16",
    "use_cache": true,
    "use_sliding_window": false,
    "vocab_size": 151936
  },
  "torch_dtype": "float32",
  "transformers_version": "4.52.0.dev0"
}

But what is actually saved (the missing line is marked with a leading -):

{
  "architectures": [
    "MyModel"
  ],
  "model_type": "test",
  "text_config": {
    "_name_or_path": "Qwen/Qwen3-8B",
    "architectures": [
      "Qwen3ForCausalLM"
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "eos_token_id": 151645,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 12288,
    "max_position_embeddings": 40960,
    "max_window_layers": 36,
    "model_type": "qwen3",
    "num_attention_heads": 32,
    "num_hidden_layers": 36,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "rope_theta": 1000000,
    "sliding_window": null,
-   "tie_word_embeddings": false,
    "torch_dtype": "bfloat16",
    "use_cache": true,
    "use_sliding_window": false,
    "vocab_size": 151936
  },
  "torch_dtype": "float32",
  "transformers_version": "4.52.0.dev0"
}
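
For completeness, a minimal round-trip check on the written file (the directory name is an assumption: recent TrainingArguments versions default output_dir to "trainer_output" when none is passed, so adjust the path to whatever the run actually used):

import json
import os

# Hypothetical path: point this at the TrainingArguments output_dir of the run.
with open(os.path.join("trainer_output", "config.json")) as f:
    saved = json.load(f)

# Expected True (key preserved in text_config); this report observes it missing.
print("tie_word_embeddings" in saved.get("text_config", {}))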
