21 changes: 0 additions & 21 deletions neural_compressor/common/utils/__init__.py
@@ -21,27 +21,6 @@
 from neural_compressor.common.utils.utility import *


-# FIXME: (Yi) REMOVE BELOW CODE
-import os
-
-DEEPSEEK_EXPERTS = 256
-VLLM_TP_SIZE = int(os.getenv("VLLM_TP_SIZE", "8"))
-VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", VLLM_TP_SIZE))
-NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE  # 32
-VLLM_MOE_N_SLICE = int(os.getenv("VLLM_MOE_N_SLICE", 8))
-NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // VLLM_MOE_N_SLICE  # 4
-FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK  # 4
-
-logger.warning_once(
-    (
-        f"INC uses VLLM_TP_SIZE={VLLM_TP_SIZE},\n"
-        f"VLLM_EP_SIZE={VLLM_EP_SIZE},\n"
-        f"NUM_EXPERTS_PER_EP_RANK={NUM_EXPERTS_PER_EP_RANK},\n"
-        f"VLLM_MOE_N_SLICE={VLLM_MOE_N_SLICE},\n"
-        f"NUM_EXPERTS_PER_GROUP_PER_RANK={NUM_EXPERTS_PER_GROUP_PER_RANK},\n"
-        f"FUSED_MOE_EXPERTS={FUSED_MOE_EXPERTS}"
-    )
-)

 import sys
 import pdb
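For context, the deleted block derived per-rank and per-slice expert counts for a DeepSeek-style MoE from environment variables at import time. A minimal standalone sketch of that arithmetic (defaults mirror the removed code; the lowercase names are illustrative, not library symbols):

```python
# Sketch of the expert-slicing arithmetic in the removed block.
import os

DEEPSEEK_EXPERTS = 256                                            # total routed experts
VLLM_TP_SIZE = int(os.getenv("VLLM_TP_SIZE", "8"))                # tensor-parallel size
VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", str(VLLM_TP_SIZE)))  # expert-parallel size, defaults to TP
VLLM_MOE_N_SLICE = int(os.getenv("VLLM_MOE_N_SLICE", "8"))        # fused-MoE slices per EP rank

experts_per_ep_rank = DEEPSEEK_EXPERTS // VLLM_EP_SIZE            # 256 // 8 = 32
experts_per_slice = experts_per_ep_rank // VLLM_MOE_N_SLICE       # 32 // 8 = 4
assert experts_per_ep_rank * VLLM_EP_SIZE == DEEPSEEK_EXPERTS, "experts must split evenly"
```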
15 changes: 7 additions & 8 deletions neural_compressor/torch/algorithms/fp8_quant/_core/common.py
@@ -32,7 +32,6 @@
     get_patched_module_table,
     get_patched_module_type_table,
 )
-from neural_compressor.common import utils as inc_utils
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator, INCAcceleratorType
 deepspeed_exists = False
 if importlib.util.find_spec("deepspeed"):  # check if deepspeed is installed
@@ -59,7 +58,7 @@ def maybe_dequant_original_fp8_weight(mod: torch.nn.Module, param: torch.Tensor)
     "dynamic_moe": ModuleType(
         1,
         [],
-        inc_utils.FUSED_MOE_EXPERTS + 1,  # FIXME (Yi) # one output, FUSED_MOE_EXPERTS weights
+        8 + 1,
         True,
     ),
 }
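The comment on the removed line reads "one output, FUSED_MOE_EXPERTS weights", so the final count passed to ModuleType is the fused-expert count plus one; this hunk pins it to 8 + 1 rather than deriving it from the now-deleted inc_utils globals. A rough sketch of that invariant, assuming a namedtuple-like ModuleType whose field names are inferred from the call shape above, not from the library:

```python
# Hypothetical reconstruction of ModuleType, for illustration only.
from collections import namedtuple

ModuleType = namedtuple("ModuleType", ["num_inputs", "param_names", "num_outputs", "required_output"])

FUSED_MOE_EXPERTS = 8  # fixed here; previously 256 // VLLM_EP_SIZE // VLLM_MOE_N_SLICE
dynamic_moe = ModuleType(1, [], FUSED_MOE_EXPERTS + 1, True)
assert dynamic_moe.num_outputs == 9  # one routed output + 8 fused-expert outputs
```

Note that the fixed 8 matches 256 // VLLM_EP_SIZE // VLLM_MOE_N_SLICE only for configurations where that quotient is 8 (for example EP size 4 with 8 slices), not the 4 produced by the defaults in the removed block.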
@@ -235,12 +234,12 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev
     "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
     # FIXME (Yi) revert change
     "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False),
-    # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
-    # "VllmMixtureOfExpertsOp": (
-    #     ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
-    #     if os.getenv("LOW_CPU_MEM", "0") == "1"
-    #     else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
-    # ),
+    "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
+    "VllmMixtureOfExpertsOp": (
+        ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
+        if os.getenv("LOW_CPU_MEM", "0") == "1"
+        else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
+    ),
 }


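The re-enabled VllmMixtureOfExpertsOp entry selects its patched class when the module is first imported, so LOW_CPU_MEM must be set in the environment before the process starts; changing it afterwards has no effect. A small sketch of that import-time gating pattern (class names are stand-ins for the real patched ops):

```python
# Import-time selection: the dict entry is evaluated exactly once, when the
# defining module loads, so the env var must be set before that import.
import os

class PatchedOpV1: ...  # default implementation (stand-in)
class PatchedOpV2: ...  # low-CPU-memory implementation (stand-in)

MOE_PATCH_CLS = (
    PatchedOpV2
    if os.getenv("LOW_CPU_MEM", "0") == "1"
    else PatchedOpV1
)
```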
@@ -781,6 +781,7 @@ def forward_quant(self,
                       hidden_states,
                       expert_routing_table,
                       router_weights,
+                      layer=None,
                       permuted_weights=True,
                       activation="silu"):
         experts_range = range(self.num_experts)
@@ -810,6 +811,7 @@ def forward_measure(self,
                       hidden_states,
                       expert_routing_table,
                       router_weights,
+                      layer=None,
                       permuted_weights=True,
                       activation="silu"):
         experts_range = range(self.num_experts)
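Both patched forwards gain an optional layer parameter, inserted before permuted_weights. Keyword call sites and callers passing only the first three arguments keep working unchanged, but anything that passed permuted_weights as the fourth positional argument would now bind it to layer. A minimal sketch of that compatibility behavior (a stub, not the real op):

```python
# Illustrative stub: `layer=None` is backward compatible for keyword callers
# but shifts the meaning of the fourth positional argument.
class PatchedMoeOpStub:
    def forward_quant(self, hidden_states, expert_routing_table, router_weights,
                      layer=None, permuted_weights=True, activation="silu"):
        return layer, permuted_weights

op = PatchedMoeOpStub()
assert op.forward_quant("h", "r", "w") == (None, True)            # old minimal call
assert op.forward_quant("h", "r", "w", layer="L") == ("L", True)  # new keyword call
assert op.forward_quant("h", "r", "w", "X") == ("X", True)        # positional 4th arg now lands in `layer`
```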