19 changes: 2 additions & 17 deletions auto_round/export/export_to_autogptq/export.py
@@ -16,12 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Any, Callable, Dict, Union
 
-import threadpoolctl as tctl
-
 # MIT License
 #
 # Copyright (c) 2023 潘其威(William)
@@ -295,21 +292,9 @@ def save_quantized_as_autogptq(
     model = copy.deepcopy(model.to("cpu"))
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     if output_dir is None:
         return model
     quantization_config["lm_head"] = lm_head_quantized
18 changes: 2 additions & 16 deletions auto_round/export/export_to_autoround/export.py
@@ -18,12 +18,10 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from enum import Enum
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -309,21 +307,9 @@ def save_quantized_as_autoround(
         quantization_config["extra_config"] = extra_config
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     filter_quantization_config(quantization_config)
     if hasattr(model, "config"):
         model.config.quantization_config = quantization_config
20 changes: 3 additions & 17 deletions auto_round/export/export_to_autoround/export_to_fp8.py
@@ -15,11 +15,9 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -175,7 +173,7 @@ def pack_layer(layer_name, model, data_type, device=None, unsqueeze=False):
     )
     if (
         unsqueeze
-        and isinstance(linear_cls, FP8QLinear)
+        and isinstance(my_linear, FP8QLinear)
         and len(my_linear.weight_scale.shape)
         and my_linear.weight_scale.shape[0] != 1
     ):
@@ -240,20 +238,8 @@ def save_quantized_as_autoround(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
     regex_config = quantization_config.pop("regex_config")
     if regex_config is not None:
         for name in regex_config.keys():
18 changes: 2 additions & 16 deletions auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
@@ -16,11 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -235,20 +233,8 @@ def save_quantized_as_fp(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, backend, device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, backend, device)
     filter_quantization_config(quantization_config)
 
     if hasattr(model, "config"):
18 changes: 2 additions & 16 deletions auto_round/export/export_to_awq/export.py
@@ -23,10 +23,8 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 from tqdm import tqdm
@@ -179,21 +177,9 @@ def save_quantized_as_autoawq(
     names = list(layer_config.keys())
 
     backend = None
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, compressed_model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, compressed_model, backend, device)
     if output_dir is None:
         return model
 
18 changes: 2 additions & 16 deletions auto_round/export/export_to_llmcompressor/export_to_fp.py
@@ -16,10 +16,8 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -190,21 +188,9 @@ def save_quantized_as_fp(
         update_fused_layer_global_scales(module, base_name="input")
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() or not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, device)
-                    pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, device)
 
     ignore = generate_ignore_regex_list(regex_config=regex_config, layer_config=layer_config)
 
24 changes: 2 additions & 22 deletions auto_round/export/export_to_llmcompressor/export_to_static_fp.py
@@ -16,10 +16,8 @@
 import json
 import os
 import sys
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -36,7 +34,6 @@
     get_module,
     get_packing_device,
    is_gaudi2,
-    is_hpex_available,
     logger,
     set_module,
     unsupported_meta_device,
@@ -151,26 +148,9 @@ def save_quantized_as_static_fp(
     image_processor = kwargs.get("image_processor", None)
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-
-        if is_hpex_available():  # packing will cause hang occasionally on hpu
-            for name in tqdm(names, total=len(names), leave=True, desc="packing"):
-                pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-        else:
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                with tqdm(total=len(names), leave=True) as pbar:
-
-                    def wrapper(name):
-                        pbar.set_description(f"packing {name}")
-                        with tctl.threadpool_limits(limits=1):
-                            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                        pbar.update(1)
-
-                    for _ in executor.map(wrapper, names):
-                        pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
 
     # Get llm-compressor format config
     check_compressed_tensors_supported()
1 change: 0 additions & 1 deletion requirements-hpu.txt
@@ -7,4 +7,3 @@ tqdm
 packaging
 pillow
 transformers
-threadpoolctl
1 change: 0 additions & 1 deletion requirements.txt
@@ -3,7 +3,6 @@ accelerate
 datasets
 numpy
 py-cpuinfo
-threadpoolctl
 torch
 tqdm
 transformers>=4.38
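
All seven export modules above converge on the same change: the threadpoolctl-limited ThreadPoolExecutor is replaced by a plain serial loop over the layer names. As a minimal standalone sketch of that loop (pack_all_layers and its argument-forwarding signature are illustrative only, not part of the codebase; each real module calls its own pack_layer with its own arguments):

from typing import Any, Callable, Sequence

from tqdm import tqdm


def pack_all_layers(names: Sequence[str], pack_layer: Callable[..., None], *pack_args: Any) -> None:
    # Serial packing on the calling thread: one pack_layer call per layer,
    # with tqdm reporting progress. The removed executor path (max_workers=2
    # plus tctl.threadpool_limits) could hang occasionally when packing with
    # CUDA, which is why the loop is now single-threaded.
    for name in tqdm(names, desc="packing", leave=True):
        pack_layer(name, *pack_args)

For the weight-only exporters this would be called as pack_all_layers(names, pack_layer, model, backend, device), matching the loops in the diffs above.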
13 changes: 13 additions & 0 deletions test/helpers.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import copy
 import os
 import re
@@ -14,6 +15,18 @@
 transformers_version = version.parse(transformers.__version__)
 
 
+def _raise_threaded_packing(*args, **kwargs):
+    raise AssertionError("Packing should not create a thread pool or call threadpoolctl.")
+
+
+def forbid_threaded_packing(monkeypatch, module):
+    monkeypatch.setattr(concurrent.futures, "ThreadPoolExecutor", _raise_threaded_packing)
+    monkeypatch.setattr(module, "ThreadPoolExecutor", _raise_threaded_packing, raising=False)
+    tctl = getattr(module, "tctl", None)
+    if tctl is not None:
+        monkeypatch.setattr(tctl, "threadpool_limits", _raise_threaded_packing)
+
+
 def generate_prompt(model_obj_or_str, tokenizer=None, text="The capital of France is,", max_new_tokens=10, device=None):
     """Generate text using a model and tokenizer.
 
49 changes: 48 additions & 1 deletion test/test_cpu/export/test_export.py
@@ -8,8 +8,12 @@
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_autogptq import export as autogptq_export
+from auto_round.export.export_to_autoround import export as autoround_export
+from auto_round.export.export_to_autoround import export_to_fp8 as autoround_fp8_export
+from auto_round.export.export_to_awq import export as awq_export
 
-from ...helpers import get_model_path, opt_name_or_path, transformers_version
+from ...helpers import forbid_threaded_packing, get_model_path, opt_name_or_path, transformers_version
 
 
 def _get_folder_size(path: str) -> float:
@@ -496,3 +500,46 @@ def test_llmc_dynamic_wint8aint8_export_with_tuning(self, dataloader):
         with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f:
             assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
             assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8
+
+
+@pytest.mark.parametrize(
+    "format_name,export_module,sym",
+    [
+        ("auto_gptq", autogptq_export, False),
+        ("auto_awq", awq_export, False),
+        ("auto_round", autoround_export, True),
+    ],
+)
+def test_weight_only_exports_pack_serially(tiny_opt_model_path, tmp_path, monkeypatch, format_name, export_module, sym):
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=4,
+        group_size=128,
+        sym=sym,
+        iters=0,
+        disable_opt_rtn=True,
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, export_module)
+    autoround.save_quantized(output_dir=tmp_path, inplace=False, format=format_name)
+    assert os.path.exists(os.path.join(tmp_path, "config.json"))
+
+
+def test_fp8_autoround_export_packs_serially(tiny_opt_model_path, tmp_path, monkeypatch):
+    from safetensors import safe_open
+
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=8,
+        group_size=-1,
+        iters=0,
+        scheme="FP8_STATIC",
+        nsamples=2,
+        seqlen=2,
+        static_kv_dtype="fp8",
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, autoround_fp8_export)
+    autoround.save_quantized(output_dir=tmp_path, format="auto_round")
+    with safe_open(os.path.join(tmp_path, "model.safetensors"), framework="pt") as f:
+        assert "model.decoder.layers.0.self_attn.k_proj.weight_scale" in f.keys()