From 11d3c7d4eec7e752958977ca5236e8086dd31eaa Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Thu, 16 Apr 2026 16:40:34 +0800
Subject: [PATCH 1/5] fp8_block bug fix (#1693)

---
 auto_round/export/export_to_autoround/export_to_fp8.py | 2 +-
 test/test_cpu/export/test_llmc_format.py               | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py
index fadebba5d..38af5b7b5 100644
--- a/auto_round/export/export_to_autoround/export_to_fp8.py
+++ b/auto_round/export/export_to_autoround/export_to_fp8.py
@@ -175,7 +175,7 @@ def pack_layer(layer_name, model, data_type, device=None, unsqueeze=False):
         )
         if (
             unsqueeze
-            and isinstance(linear_cls, FP8QLinear)
+            and isinstance(my_linear, FP8QLinear)
             and len(my_linear.weight_scale.shape)
             and my_linear.weight_scale.shape[0] != 1
         ):
diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py
index ebc1bd87c..02c062d7e 100644
--- a/test/test_cpu/export/test_llmc_format.py
+++ b/test/test_cpu/export/test_llmc_format.py
@@ -63,12 +63,16 @@ def test_llmcompressor_fp8(self):
         import json
 
+        from safetensors import safe_open
         config = json.load(open(os.path.join(self.save_dir, "config.json")))
         assert "group_0" in config["quantization_config"]["config_groups"]
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8
         assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel"
         assert config["quantization_config"]["quant_method"] == "compressed-tensors"
+        f = safe_open(os.path.join(self.save_dir, "model.safetensors"), framework="pt")
+        assert len(f.get_tensor("model.decoder.layers.0.fc1.weight_scale").shape) == 2
+
     def test_autoround_llmcompressor_fp8(self):
         ## quantize the model
         model_name = opt_name_or_path

From 7ea88e70c0f051b7e870d719845e87678220e32e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Apr 2026 06:00:50 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 test/test_cpu/export/test_llmc_format.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py
index 02c062d7e..75b852af3 100644
--- a/test/test_cpu/export/test_llmc_format.py
+++ b/test/test_cpu/export/test_llmc_format.py
@@ -64,6 +64,7 @@ def test_llmcompressor_fp8(self):
         import json
 
         from safetensors import safe_open
+
         config = json.load(open(os.path.join(self.save_dir, "config.json")))
         assert "group_0" in config["quantization_config"]["config_groups"]
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8

From 38bbb223a21112a332d2fa9c6a47ea12d8bcfac3 Mon Sep 17 00:00:00 2001
From: chensuyue
Date: Fri, 24 Apr 2026 14:15:07 +0800
Subject: [PATCH 3/5] fix branch

Signed-off-by: chensuyue
---
 .azure-pipelines/code-scan.yml          | 2 +-
 .azure-pipelines/compatibility-test.yml | 2 +-
 .azure-pipelines/performance-test.yaml  | 2 +-
 .azure-pipelines/unit-test-cuda.yml     | 2 +-
 .azure-pipelines/unit-test-hpu.yml      | 2 +-
 .azure-pipelines/unit-test-xpu.yml      | 2 +-
 .azure-pipelines/unit-test.yml          | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.azure-pipelines/code-scan.yml b/.azure-pipelines/code-scan.yml
index 028a0d086..e18414b0f 100644
--- a/.azure-pipelines/code-scan.yml
+++ b/.azure-pipelines/code-scan.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/compatibility-test.yml b/.azure-pipelines/compatibility-test.yml
index 5bc47f5e3..b314d2c8b 100644
--- a/.azure-pipelines/compatibility-test.yml
+++ b/.azure-pipelines/compatibility-test.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/performance-test.yaml b/.azure-pipelines/performance-test.yaml
index 694c38162..1cc5a9ca8 100644
--- a/.azure-pipelines/performance-test.yaml
+++ b/.azure-pipelines/performance-test.yaml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test-cuda.yml b/.azure-pipelines/unit-test-cuda.yml
index 9b7c13cb6..a9040db6f 100644
--- a/.azure-pipelines/unit-test-cuda.yml
+++ b/.azure-pipelines/unit-test-cuda.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test-hpu.yml b/.azure-pipelines/unit-test-hpu.yml
index d9e0a263e..5da27231f 100644
--- a/.azure-pipelines/unit-test-hpu.yml
+++ b/.azure-pipelines/unit-test-hpu.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test-xpu.yml b/.azure-pipelines/unit-test-xpu.yml
index 5d9b081e3..e90536ca1 100644
--- a/.azure-pipelines/unit-test-xpu.yml
+++ b/.azure-pipelines/unit-test-xpu.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test.yml b/.azure-pipelines/unit-test.yml
index bd28294a2..1d4536b16 100644
--- a/.azure-pipelines/unit-test.yml
+++ b/.azure-pipelines/unit-test.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round

From a10c1aa8840f889cb3812a8ed1736b27c8ed7c3b Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 24 Apr 2026 08:48:21 +0000
Subject: [PATCH 4/5] Backport remove threaded packing from exporters

---
 .../export/export_to_autogptq/export.py      | 19 +------
 .../export/export_to_autoround/export.py     | 18 +------
 .../export_to_autoround/export_to_fp8.py     | 18 +------
 .../export_to_nvfp_mxfp.py                   | 18 +------
 auto_round/export/export_to_awq/export.py    | 18 +------
 .../export_to_llmcompressor/export_to_fp.py  | 18 +------
 .../export_to_static_fp.py                   | 24 +--------
 requirements-hpu.txt                         |  1 -
 requirements.txt                             |  1 -
 test/helpers.py                              | 13 +++++
 test/test_cpu/export/test_export.py          | 49 ++++++++++++++++++-
 test/test_cpu/export/test_llmc_format.py     | 32 +++++++++++-
 test/test_cpu/quantization/test_mxfp_nvfp.py |  7 +--
 13 files changed, 110 insertions(+), 126 deletions(-)

diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py
index 98fde7b73..5594b5979 100644
--- a/auto_round/export/export_to_autogptq/export.py
+++ b/auto_round/export/export_to_autogptq/export.py
@@ -16,12 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Any, Callable, Dict, Union
 
-import threadpoolctl as tctl
-
 # MIT License
 #
 # Copyright (c) 2023 潘其威(William)
@@ -295,21 +292,9 @@ def save_quantized_as_autogptq(
     model = copy.deepcopy(model.to("cpu"))
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     if output_dir is None:
         return model
     quantization_config["lm_head"] = lm_head_quantized
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 372638e99..60613b7e8 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -18,12 +18,10 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from enum import Enum
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -309,21 +307,9 @@ def save_quantized_as_autoround(
         quantization_config["extra_config"] = extra_config
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     filter_quantization_config(quantization_config)
     if hasattr(model, "config"):
         model.config.quantization_config = quantization_config
diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py
index 38af5b7b5..90c228583 100644
--- a/auto_round/export/export_to_autoround/export_to_fp8.py
+++ b/auto_round/export/export_to_autoround/export_to_fp8.py
@@ -15,11 +15,9 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -240,20 +238,8 @@ def save_quantized_as_autoround(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
     regex_config = quantization_config.pop("regex_config")
     if regex_config is not None:
         for name in regex_config.keys():
diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
index 502c49676..f3f18cf03 100644
--- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
+++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
@@ -16,11 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -235,20 +233,8 @@ def save_quantized_as_fp(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, backend, device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, backend, device)
 
     filter_quantization_config(quantization_config)
     if hasattr(model, "config"):
diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py
index 1fedf3c60..cc178097a 100644
--- a/auto_round/export/export_to_awq/export.py
+++ b/auto_round/export/export_to_awq/export.py
@@ -23,10 +23,8 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 from tqdm import tqdm
@@ -179,21 +177,9 @@ def save_quantized_as_autoawq(
     names = list(layer_config.keys())
 
     backend = None
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, compressed_model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, compressed_model, backend, device)
 
     if output_dir is None:
         return model
diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py
index 05c01ee3a..b2a017189 100644
--- a/auto_round/export/export_to_llmcompressor/export_to_fp.py
+++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py
@@ -16,10 +16,8 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -190,21 +188,9 @@ def save_quantized_as_fp(
             update_fused_layer_global_scales(module, base_name="input")
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() or not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, device)
-                    pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, device)
 
     ignore = generate_ignore_regex_list(regex_config=regex_config, layer_config=layer_config)
 
diff --git a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py
index 6c534f8fc..c733c4510 100644
--- a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py
+++ b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py
@@ -16,10 +16,8 @@
 import json
 import os
 import sys
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -36,7 +34,6 @@
     get_module,
     get_packing_device,
     is_gaudi2,
-    is_hpex_available,
     logger,
     set_module,
     unsupported_meta_device,
@@ -151,26 +148,9 @@ def save_quantized_as_static_fp(
     image_processor = kwargs.get("image_processor", None)
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-
-        if is_hpex_available():  # packing will cause hang occasionally on hpu
-            for name in tqdm(names, total=len(names), leave=True, desc="packing"):
-                pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-        else:
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                with tqdm(total=len(names), leave=True) as pbar:
-
-                    def wrapper(name):
-                        pbar.set_description(f"packing {name}")
-                        with tctl.threadpool_limits(limits=1):
-                            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                        pbar.update(1)
-
-                    for _ in executor.map(wrapper, names):
-                        pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
 
     # Get llm-compressor format config
     check_compressed_tensors_supported()
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index d2704fd1a..0fbe967a4 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -7,4 +7,3 @@ tqdm
 packaging
 pillow
 transformers
-threadpoolctl
diff --git a/requirements.txt b/requirements.txt
index b7abf7181..1bb587b64 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ accelerate
 datasets
 numpy
 py-cpuinfo
-threadpoolctl
 torch
 tqdm
 transformers>=4.38
diff --git a/test/helpers.py b/test/helpers.py
index a45c953f5..f3872060a 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import copy
 import os
 import re
@@ -14,6 +15,18 @@
 transformers_version = version.parse(transformers.__version__)
 
 
+def _raise_threaded_packing(*args, **kwargs):
+    raise AssertionError("Packing should not create a thread pool or call threadpoolctl.")
+
+
+def forbid_threaded_packing(monkeypatch, module):
+    monkeypatch.setattr(concurrent.futures, "ThreadPoolExecutor", _raise_threaded_packing)
+    monkeypatch.setattr(module, "ThreadPoolExecutor", _raise_threaded_packing, raising=False)
+    tctl = getattr(module, "tctl", None)
+    if tctl is not None:
+        monkeypatch.setattr(tctl, "threadpool_limits", _raise_threaded_packing)
+
+
 def generate_prompt(model_obj_or_str, tokenizer=None, text="The capital of France is,", max_new_tokens=10, device=None):
     """Generate text using a model and tokenizer.
 
diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py
index 2ef2827b2..7fe8a6140 100644
--- a/test/test_cpu/export/test_export.py
+++ b/test/test_cpu/export/test_export.py
@@ -8,8 +8,12 @@
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_autogptq import export as autogptq_export
+from auto_round.export.export_to_autoround import export as autoround_export
+from auto_round.export.export_to_autoround import export_to_fp8 as autoround_fp8_export
+from auto_round.export.export_to_awq import export as awq_export
 
-from ...helpers import get_model_path, opt_name_or_path, transformers_version
+from ...helpers import forbid_threaded_packing, get_model_path, opt_name_or_path, transformers_version
 
 
 def _get_folder_size(path: str) -> float:
@@ -496,3 +500,46 @@ def test_llmc_dynamic_wint8aint8_export_with_tuning(self, dataloader):
         with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f:
             assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
             assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8
+
+
+@pytest.mark.parametrize(
+    "format_name,export_module,sym",
+    [
+        ("auto_gptq", autogptq_export, False),
+        ("auto_awq", awq_export, False),
+        ("auto_round", autoround_export, True),
+    ],
+)
+def test_weight_only_exports_pack_serially(tiny_opt_model_path, tmp_path, monkeypatch, format_name, export_module, sym):
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=4,
+        group_size=128,
+        sym=sym,
+        iters=0,
+        disable_opt_rtn=True,
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, export_module)
+    autoround.save_quantized(output_dir=tmp_path, inplace=False, format=format_name)
+    assert os.path.exists(os.path.join(tmp_path, "config.json"))
+
+
+def test_fp8_autoround_export_packs_serially(tiny_opt_model_path, tmp_path, monkeypatch):
+    from safetensors import safe_open
+
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=8,
+        group_size=-1,
+        iters=0,
+        scheme="FP8_STATIC",
+        nsamples=2,
+        seqlen=2,
+        static_kv_dtype="fp8",
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, autoround_fp8_export)
+    autoround.save_quantized(output_dir=tmp_path, format="auto_round")
+    with safe_open(os.path.join(tmp_path, "model.safetensors"), framework="pt") as f:
+        assert "model.decoder.layers.0.self_attn.k_proj.weight_scale" in f.keys()
diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py
index 75b852af3..725cb6b05 100644
--- a/test/test_cpu/export/test_llmc_format.py
+++ b/test/test_cpu/export/test_llmc_format.py
@@ -6,8 +6,10 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_llmcompressor import export_to_fp as llmc_fp_export
+from auto_round.export.export_to_llmcompressor import export_to_static_fp as llmc_static_fp_export
 
-from ...helpers import get_model_path, opt_name_or_path
+from ...helpers import forbid_threaded_packing, get_model_path, opt_name_or_path
 
 
 class TestLLMC:
@@ -95,3 +97,31 @@ def test_autoround_llmcompressor_fp8(self):
         assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor"
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"] == "tensor"
         assert config["quantization_config"]["quant_method"] == "compressed-tensors"
+
+
+def test_llmcompressor_static_fp_export_packs_serially(tiny_opt_model_path, tmp_path, monkeypatch):
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        scheme="FP8_STATIC",
+        seqlen=8,
+        nsamples=2,
+        iters=0,
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, llmc_static_fp_export)
+    autoround.save_quantized(tmp_path, format="llm_compressor")
+    assert os.path.exists(os.path.join(tmp_path, "config.json"))
+
+
+def test_llmcompressor_mxfp8_export_packs_serially(tmp_path, monkeypatch):
+    autoround = AutoRound(
+        model=opt_name_or_path,
+        iters=0,
+        disable_opt_rtn=True,
+        scheme="mxfp8",
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, llmc_fp_export)
+    compressed_model = autoround.save_quantized(output_dir=tmp_path, format="llm_compressor")
+    tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
+    assert hasattr(tmp_layer, "weight_scale")
diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py
index fb8d51c84..e64621fc4 100644
--- a/test/test_cpu/quantization/test_mxfp_nvfp.py
+++ b/test/test_cpu/quantization/test_mxfp_nvfp.py
@@ -8,8 +8,9 @@
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_autoround import export_to_nvfp_mxfp as autoround_nvfp_mxfp_export
 
-from ...helpers import is_model_outputs_similar, transformers_version
+from ...helpers import forbid_threaded_packing, is_model_outputs_similar, transformers_version
 
 
 def _get_folder_size(path: str) -> float:
@@ -292,9 +293,8 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader):
             and tmp_layer.weight_scale.shape[0] == 768
         ), "Illegal NVFP4 packing name or data_type or shape"
 
-    def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader):
+    def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader, monkeypatch):
         model_name = tiny_opt_model_path
-        from transformers import AutoConfig
 
         scheme = "NVFP4"
         autoround = AutoRound(
@@ -306,6 +306,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         autoround.quantize()
+        forbid_threaded_packing(monkeypatch, autoround_nvfp_mxfp_export)
         compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round")
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (

From a6845a36c70355c2564c0bd2efa2d4f73e2b23e5 Mon Sep 17 00:00:00 2001
From: Liang Lv
Date: Fri, 10 Apr 2026 14:21:12 +0800
Subject: [PATCH 5/5] Fix omni model test CI issue (#1667)

Signed-off-by: lvliang-intel
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Haihao Shen
---
 test/test_cpu/models/test_omni_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cpu/models/test_omni_model.py b/test/test_cpu/models/test_omni_model.py
index 2c7968752..136f19b31 100644
--- a/test/test_cpu/models/test_omni_model.py
+++ b/test/test_cpu/models/test_omni_model.py
@@ -263,6 +263,7 @@ def test_weight_fidelity(self):
         """Test that unfused weights match original fused weights."""
         from auto_round.modeling.fused_moe.replace_modules import apply_replacements, materialize_model_
 
+        torch.manual_seed(42)
         config = _make_tiny_qwen3_omni_moe_config()
         model = Qwen3OmniMoeForConditionalGeneration(config)