System Info
- transformers_version: 4.52.0.dev0
- python: 3.10
Who can help?
@ArthurZucker @SunMarc @qubvel @zucchini-nlp
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel, Trainer, TrainingArguments


# Minimal composite config with a single text sub-config.
class MyConfig(PretrainedConfig):
    model_type = "test"
    sub_configs = {"text_config": AutoConfig}

    def __init__(
        self,
        text_config=None,
        **kwargs,
    ):
        self.text_config = text_config
        super().__init__(**kwargs)


# Wrapper model that simply holds a pretrained language model.
class MyModel(PreTrainedModel):
    config_class = MyConfig

    def __init__(self, config, language_model=None) -> None:
        super().__init__(config)
        self.language_model = language_model
        self.post_init()


text_config = AutoConfig.from_pretrained("Qwen/Qwen3-8B")
config = MyConfig(text_config=text_config)
language_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B")
model = MyModel(config, language_model)

trainer = Trainer(
    model=model,
    args=TrainingArguments(),
)
trainer.save_model()

Expected behavior
The saved config.json is missing the tie_word_embeddings key inside text_config (a minimal check is sketched after the two configs below).
Expected:
{
"architectures": [
"MyModel"
],
"model_type": "test",
"text_config": {
"_name_or_path": "Qwen/Qwen3-8B",
"architectures": [
"Qwen3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 40960,
"max_window_layers": 28,
"model_type": "qwen3",
"num_attention_heads": 16,
"num_hidden_layers": 28,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936
},
"torch_dtype": "float32",
"transformers_version": "4.52.0.dev0"
}

But:
{
"architectures": [
"MyModel"
],
"model_type": "test",
"text_config": {
"_name_or_path": "Qwen/Qwen3-8B",
"architectures": [
"Qwen3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 12288,
"max_position_embeddings": 40960,
"max_window_layers": 36,
"model_type": "qwen3",
"num_attention_heads": 32,
"num_hidden_layers": 36,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
"sliding_window": null,
- "tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936
},
"torch_dtype": "float32",
"transformers_version": "4.52.0.dev0"
}
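For completeness, the missing key can be confirmed by reloading the config that trainer.save_model() wrote. This is a minimal sketch; the directory name "trainer_output" is an assumption, since TrainingArguments() was constructed without an explicit output_dir and falls back to its own default.

import json

# "trainer_output" is an assumed directory name; substitute whatever
# output_dir the TrainingArguments() instance actually resolved to.
with open("trainer_output/config.json") as f:
    saved = json.load(f)

# With the bug present this prints False, because the key was dropped
# from the text_config sub-config during saving.
print("tie_word_embeddings" in saved["text_config"])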