-Original file line number
+Diff line change
@@ Expand Up / @@ -118,6 +118,10 @@ class Qwen3MoeConfig(PreTrainedConfig): @@
         model_type = "qwen3_moe"
         keys_to_ignore_at_inference = ["past_key_values"]
+        attribute_map = {
+            "num_experts": "num_local_experts",
+        }
         # Default tensor parallel plan for base model `Qwen3Moe`
         base_model_tp_plan = {
             "layers.*.self_attn.q_proj": "colwise",
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -720,6 +720,10 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): @@
         model_type = "qwen3_omni_moe_talker_text"
         keys_to_ignore_at_inference = ["past_key_values"]
+        attribute_map = {
+            "num_experts": "num_local_experts",
+        }
         # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerText`
         base_model_tp_plan = {
             "layers.*.self_attn.q_proj": "colwise",
@@ Expand Down @@

tests/quantization/finegrained_fp8/test_fp8.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -18,6 +18,8 @@ @@
     from contextlib import ExitStack, contextmanager
     from unittest.mock import patch
+    from parameterized import parameterized
     from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config, OPTForCausalLM
     from transformers.quantizers.quantizer_finegrained_fp8 import FineGrainedFP8HfQuantizer
     from transformers.testing_utils import (
@@ Expand Down Expand Up / @@ -137,6 +139,16 @@ def tearDown(self): @@
             backend_empty_cache(torch_device)
             gc.collect()
+        @parameterized.expand(  # generates one test case per checkpoint id in the list below
+            [
+                "hf-internal-testing/tiny-random-Qwen3MoeForCausalLM",
+                "hf-internal-testing/tiny-random-MixtralForCausalLM",
+            ]
+        )
+        def test_moe_conversion_doesnt_raise(self, model_id):  # smoke test: passes as long as loading does not raise
+            quantization_config = FineGrainedFP8Config(weight_block_size=(32, 32))  # NOTE(review): (32, 32) presumably chosen to fit the tiny test checkpoints - confirm
+            AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)  # loaded model is discarded; nothing is asserted on it
         def test_quantized_model_conversion(self):
             """
             Simple test that checks if the quantized model has been converted properly
@@ Expand Down @@

Fix loading of Qwen3 FP8 #43494

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

Jan 27, 2026

+20 −0

-Original file line number
+Diff line change
@@ Expand Up / @@ -118,6 +118,10 @@ class Qwen3MoeConfig(PreTrainedConfig): @@
         model_type = "qwen3_moe"
         keys_to_ignore_at_inference = ["past_key_values"]
+        attribute_map = {
+            "num_experts": "num_local_experts",
+        }
         # Default tensor parallel plan for base model `Qwen3Moe`
         base_model_tp_plan = {
             "layers.*.self_attn.q_proj": "colwise",
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -720,6 +720,10 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): @@
         model_type = "qwen3_omni_moe_talker_text"
         keys_to_ignore_at_inference = ["past_key_values"]
+        attribute_map = {
+            "num_experts": "num_local_experts",
+        }
         # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerText`
         base_model_tp_plan = {
             "layers.*.self_attn.q_proj": "colwise",
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -18,6 +18,8 @@ @@
     from contextlib import ExitStack, contextmanager
     from unittest.mock import patch
+    from parameterized import parameterized
     from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config, OPTForCausalLM
     from transformers.quantizers.quantizer_finegrained_fp8 import FineGrainedFP8HfQuantizer
     from transformers.testing_utils import (
@@ Expand Down Expand Up / @@ -137,6 +139,16 @@ def tearDown(self): @@
             backend_empty_cache(torch_device)
             gc.collect()
+        @parameterized.expand(  # generates one test case per checkpoint id in the list below
+            [
+                "hf-internal-testing/tiny-random-Qwen3MoeForCausalLM",
+                "hf-internal-testing/tiny-random-MixtralForCausalLM",
+            ]
+        )
+        def test_moe_conversion_doesnt_raise(self, model_id):  # smoke test: passes as long as loading does not raise
+            quantization_config = FineGrainedFP8Config(weight_block_size=(32, 32))  # NOTE(review): (32, 32) presumably chosen to fit the tiny test checkpoints - confirm
+            AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)  # loaded model is discarded; nothing is asserted on it
         def test_quantized_model_conversion(self):
             """
             Simple test that checks if the quantized model has been converted properly
@@ Expand Down @@

Provide feedback