From 11d3c7d4eec7e752958977ca5236e8086dd31eaa Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Thu, 16 Apr 2026 16:40:34 +0800
Subject: [PATCH 1/5] fp8_block bug fix (#1693)

---
 auto_round/export/export_to_autoround/export_to_fp8.py | 2 +-
 test/test_cpu/export/test_llmc_format.py               | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py
index fadebba5d..38af5b7b5 100644
--- a/auto_round/export/export_to_autoround/export_to_fp8.py
+++ b/auto_round/export/export_to_autoround/export_to_fp8.py
@@ -175,7 +175,7 @@ def pack_layer(layer_name, model, data_type, device=None, unsqueeze=False):
         )
         if (
             unsqueeze
-            and isinstance(linear_cls, FP8QLinear)
+            and isinstance(my_linear, FP8QLinear)
             and len(my_linear.weight_scale.shape)
             and my_linear.weight_scale.shape[0] != 1
         ):
diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py
index ebc1bd87c..02c062d7e 100644
--- a/test/test_cpu/export/test_llmc_format.py
+++ b/test/test_cpu/export/test_llmc_format.py
@@ -63,12 +63,16 @@ def test_llmcompressor_fp8(self):
         import json
 
+        from safetensors import safe_open
         config = json.load(open(os.path.join(self.save_dir, "config.json")))
         assert "group_0" in config["quantization_config"]["config_groups"]
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8
         assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel"
         assert config["quantization_config"]["quant_method"] == "compressed-tensors"
+        f = safe_open(os.path.join(self.save_dir, "model.safetensors"), framework="pt")
+        assert len(f.get_tensor("model.decoder.layers.0.fc1.weight_scale").shape) == 2
+
     def test_autoround_llmcompressor_fp8(self):
         ## quantize the model
         model_name = opt_name_or_path

From 7ea88e70c0f051b7e870d719845e87678220e32e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Apr 2026 06:00:50 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 test/test_cpu/export/test_llmc_format.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py
index 02c062d7e..75b852af3 100644
--- a/test/test_cpu/export/test_llmc_format.py
+++ b/test/test_cpu/export/test_llmc_format.py
@@ -64,6 +64,7 @@ def test_llmcompressor_fp8(self):
         import json
 
         from safetensors import safe_open
+
         config = json.load(open(os.path.join(self.save_dir, "config.json")))
         assert "group_0" in config["quantization_config"]["config_groups"]
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8

From 38bbb223a21112a332d2fa9c6a47ea12d8bcfac3 Mon Sep 17 00:00:00 2001
From: chensuyue
Date: Fri, 24 Apr 2026 14:15:07 +0800
Subject: [PATCH 3/5] fix branch

Signed-off-by: chensuyue
---
 .azure-pipelines/code-scan.yml          | 2 +-
 .azure-pipelines/compatibility-test.yml | 2 +-
 .azure-pipelines/performance-test.yaml  | 2 +-
 .azure-pipelines/unit-test-cuda.yml     | 2 +-
 .azure-pipelines/unit-test-hpu.yml      | 2 +-
 .azure-pipelines/unit-test-xpu.yml      | 2 +-
 .azure-pipelines/unit-test.yml          | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.azure-pipelines/code-scan.yml b/.azure-pipelines/code-scan.yml
index 028a0d086..e18414b0f 100644
--- a/.azure-pipelines/code-scan.yml
+++ b/.azure-pipelines/code-scan.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/compatibility-test.yml b/.azure-pipelines/compatibility-test.yml
index 5bc47f5e3..b314d2c8b 100644
--- a/.azure-pipelines/compatibility-test.yml
+++ b/.azure-pipelines/compatibility-test.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/performance-test.yaml b/.azure-pipelines/performance-test.yaml
index 694c38162..1cc5a9ca8 100644
--- a/.azure-pipelines/performance-test.yaml
+++ b/.azure-pipelines/performance-test.yaml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test-cuda.yml b/.azure-pipelines/unit-test-cuda.yml
index 9b7c13cb6..a9040db6f 100644
--- a/.azure-pipelines/unit-test-cuda.yml
+++ b/.azure-pipelines/unit-test-cuda.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test-hpu.yml b/.azure-pipelines/unit-test-hpu.yml
index d9e0a263e..5da27231f 100644
--- a/.azure-pipelines/unit-test-hpu.yml
+++ b/.azure-pipelines/unit-test-hpu.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test-xpu.yml b/.azure-pipelines/unit-test-xpu.yml
index 5d9b081e3..e90536ca1 100644
--- a/.azure-pipelines/unit-test-xpu.yml
+++ b/.azure-pipelines/unit-test-xpu.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round
diff --git a/.azure-pipelines/unit-test.yml b/.azure-pipelines/unit-test.yml
index bd28294a2..1d4536b16 100644
--- a/.azure-pipelines/unit-test.yml
+++ b/.azure-pipelines/unit-test.yml
@@ -6,7 +6,7 @@ pr:
   branches:
     include:
       - main
-      - 'v*.rc'
+      - 'v*rc'
   paths:
     include:
       - auto_round

From a10c1aa8840f889cb3812a8ed1736b27c8ed7c3b Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 24 Apr 2026 08:48:21 +0000
Subject: [PATCH 4/5] Backport remove threaded packing from exporters

---
 .../export/export_to_autogptq/export.py      | 19 +------
 .../export/export_to_autoround/export.py     | 18 +------
 .../export_to_autoround/export_to_fp8.py     | 18 +------
 .../export_to_nvfp_mxfp.py                   | 18 +------
 auto_round/export/export_to_awq/export.py    | 18 +------
 .../export_to_llmcompressor/export_to_fp.py  | 18 +------
 .../export_to_static_fp.py                   | 24 +--------
 requirements-hpu.txt                         |  1 -
 requirements.txt                             |  1 -
 test/helpers.py                              | 13 +++++
 test/test_cpu/export/test_export.py          | 49 ++++++++++++++++++-
 test/test_cpu/export/test_llmc_format.py     | 32 +++++++++++-
 test/test_cpu/quantization/test_mxfp_nvfp.py |  7 +--
 13 files changed, 110 insertions(+), 126 deletions(-)

diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py
index 98fde7b73..5594b5979 100644
--- a/auto_round/export/export_to_autogptq/export.py
+++ b/auto_round/export/export_to_autogptq/export.py
@@ -16,12 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Any, Callable, Dict, Union
 
-import threadpoolctl as tctl
-
 # MIT License
 #
 # Copyright (c) 2023 潘其威(William)
@@ -295,21 +292,9 @@ def save_quantized_as_autogptq(
     model = copy.deepcopy(model.to("cpu"))
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     if output_dir is None:
         return model
     quantization_config["lm_head"] = lm_head_quantized
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 372638e99..60613b7e8 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -18,12 +18,10 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from enum import Enum
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -309,21 +307,9 @@ def save_quantized_as_autoround(
         quantization_config["extra_config"] = extra_config
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     filter_quantization_config(quantization_config)
     if hasattr(model, "config"):
         model.config.quantization_config = quantization_config
diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py
index 38af5b7b5..90c228583 100644
--- a/auto_round/export/export_to_autoround/export_to_fp8.py
+++ b/auto_round/export/export_to_autoround/export_to_fp8.py
@@ -15,11 +15,9 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -240,20 +238,8 @@ def save_quantized_as_autoround(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
     regex_config = quantization_config.pop("regex_config")
     if regex_config is not None:
         for name in regex_config.keys():
diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
index 502c49676..f3f18cf03 100644
--- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
+++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
@@ -16,11 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -235,20 +233,8 @@ def save_quantized_as_fp(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, backend, device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, backend, device)
 
     filter_quantization_config(quantization_config)
     if hasattr(model, "config"):
diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py
index 1fedf3c60..cc178097a 100644
--- a/auto_round/export/export_to_awq/export.py
+++ b/auto_round/export/export_to_awq/export.py
@@ -23,10 +23,8 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 from tqdm import tqdm
@@ -179,21 +177,9 @@ def save_quantized_as_autoawq(
     names = list(layer_config.keys())
 
     backend = None
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, compressed_model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, compressed_model, backend, device)
 
     if output_dir is None:
         return model
diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py
index 05c01ee3a..b2a017189 100644
--- a/auto_round/export/export_to_llmcompressor/export_to_fp.py
+++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py
@@ -16,10 +16,8 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -190,21 +188,9 @@ def save_quantized_as_fp(
             update_fused_layer_global_scales(module, base_name="input")
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() or not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, device)
-                    pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, device)
 
     ignore = generate_ignore_regex_list(regex_config=regex_config, layer_config=layer_config)
 
diff --git a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py
index 6c534f8fc..c733c4510 100644
--- a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py
+++ b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py
@@ -16,10 +16,8 @@
 import json
 import os
 import sys
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -36,7 +34,6 @@
     get_module,
     get_packing_device,
     is_gaudi2,
-    is_hpex_available,
     logger,
     set_module,
     unsupported_meta_device,
@@ -151,26 +148,9 @@ def save_quantized_as_static_fp(
     image_processor = kwargs.get("image_processor", None)
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-
-        if is_hpex_available():  # packing will cause hang occasionally on hpu
-            for name in tqdm(names, total=len(names), leave=True, desc="packing"):
-                pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-        else:
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                with tqdm(total=len(names), leave=True) as pbar:
-
-                    def wrapper(name):
-                        pbar.set_description(f"packing {name}")
-                        with tctl.threadpool_limits(limits=1):
-                            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                        pbar.update(1)
-
-                    for _ in executor.map(wrapper, names):
-                        pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
 
     # Get llm-compressor format config
     check_compressed_tensors_supported()
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index d2704fd1a..0fbe967a4 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -7,4 +7,3 @@ tqdm
 packaging
 pillow
 transformers
-threadpoolctl
diff --git a/requirements.txt b/requirements.txt
index b7abf7181..1bb587b64 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ accelerate
 datasets
 numpy
 py-cpuinfo
-threadpoolctl
 torch
 tqdm
 transformers>=4.38
diff --git a/test/helpers.py b/test/helpers.py
index a45c953f5..f3872060a 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import copy
 import os
 import re
@@ -14,6 +15,18 @@
 transformers_version = version.parse(transformers.__version__)
 
 
+def _raise_threaded_packing(*args, **kwargs):
+    raise AssertionError("Packing should not create a thread pool or call threadpoolctl.")
+
+
+def forbid_threaded_packing(monkeypatch, module):
+    monkeypatch.setattr(concurrent.futures, "ThreadPoolExecutor", _raise_threaded_packing)
+    monkeypatch.setattr(module, "ThreadPoolExecutor", _raise_threaded_packing, raising=False)
+    tctl = getattr(module, "tctl", None)
+    if tctl is not None:
+        monkeypatch.setattr(tctl, "threadpool_limits", _raise_threaded_packing)
+
+
 def generate_prompt(model_obj_or_str, tokenizer=None, text="The capital of France is,", max_new_tokens=10, device=None):
     """Generate text using a model and tokenizer.
 
diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py
index 2ef2827b2..7fe8a6140 100644
--- a/test/test_cpu/export/test_export.py
+++ b/test/test_cpu/export/test_export.py
@@ -8,8 +8,12 @@
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_autogptq import export as autogptq_export
+from auto_round.export.export_to_autoround import export as autoround_export
+from auto_round.export.export_to_autoround import export_to_fp8 as autoround_fp8_export
+from auto_round.export.export_to_awq import export as awq_export
 
-from ...helpers import get_model_path, opt_name_or_path, transformers_version
+from ...helpers import forbid_threaded_packing, get_model_path, opt_name_or_path, transformers_version
 
 
 def _get_folder_size(path: str) -> float:
@@ -496,3 +500,46 @@ def test_llmc_dynamic_wint8aint8_export_with_tuning(self, dataloader):
         with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f:
             assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
             assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8
+
+
+@pytest.mark.parametrize(
+    "format_name,export_module,sym",
+    [
+        ("auto_gptq", autogptq_export, False),
+        ("auto_awq", awq_export, False),
+        ("auto_round", autoround_export, True),
+    ],
+)
+def test_weight_only_exports_pack_serially(tiny_opt_model_path, tmp_path, monkeypatch, format_name, export_module, sym):
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=4,
+        group_size=128,
+        sym=sym,
+        iters=0,
+        disable_opt_rtn=True,
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, export_module)
+    autoround.save_quantized(output_dir=tmp_path, inplace=False, format=format_name)
+    assert os.path.exists(os.path.join(tmp_path, "config.json"))
+
+
+def test_fp8_autoround_export_packs_serially(tiny_opt_model_path, tmp_path, monkeypatch):
+    from safetensors import safe_open
+
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=8,
+        group_size=-1,
+        iters=0,
+        scheme="FP8_STATIC",
+        nsamples=2,
+        seqlen=2,
+        static_kv_dtype="fp8",
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, autoround_fp8_export)
+    autoround.save_quantized(output_dir=tmp_path, format="auto_round")
+    with safe_open(os.path.join(tmp_path, "model.safetensors"), framework="pt") as f:
+        assert "model.decoder.layers.0.self_attn.k_proj.weight_scale" in f.keys()
diff --git a/test/test_cpu/export/test_llmc_format.py b/test/test_cpu/export/test_llmc_format.py
index 75b852af3..725cb6b05 100644
--- a/test/test_cpu/export/test_llmc_format.py
+++ b/test/test_cpu/export/test_llmc_format.py
@@ -6,8 +6,10 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_llmcompressor import export_to_fp as llmc_fp_export
+from auto_round.export.export_to_llmcompressor import export_to_static_fp as llmc_static_fp_export
 
-from ...helpers import get_model_path, opt_name_or_path
+from ...helpers import forbid_threaded_packing, get_model_path, opt_name_or_path
 
 
 class TestLLMC:
@@ -95,3 +97,31 @@ def test_autoround_llmcompressor_fp8(self):
         assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor"
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"] == "tensor"
         assert config["quantization_config"]["quant_method"] == "compressed-tensors"
+
+
+def test_llmcompressor_static_fp_export_packs_serially(tiny_opt_model_path, tmp_path, monkeypatch):
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        scheme="FP8_STATIC",
+        seqlen=8,
+        nsamples=2,
+        iters=0,
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, llmc_static_fp_export)
+    autoround.save_quantized(tmp_path, format="llm_compressor")
+    assert os.path.exists(os.path.join(tmp_path, "config.json"))
+
+
+def test_llmcompressor_mxfp8_export_packs_serially(tmp_path, monkeypatch):
+    autoround = AutoRound(
+        model=opt_name_or_path,
+        iters=0,
+        disable_opt_rtn=True,
+        scheme="mxfp8",
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, llmc_fp_export)
+    compressed_model = autoround.save_quantized(output_dir=tmp_path, format="llm_compressor")
+    tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
+    assert hasattr(tmp_layer, "weight_scale")
diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py
index fb8d51c84..e64621fc4 100644
--- a/test/test_cpu/quantization/test_mxfp_nvfp.py
+++ b/test/test_cpu/quantization/test_mxfp_nvfp.py
@@ -8,8 +8,9 @@
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_autoround import export_to_nvfp_mxfp as autoround_nvfp_mxfp_export
 
-from ...helpers import is_model_outputs_similar, transformers_version
+from ...helpers import forbid_threaded_packing, is_model_outputs_similar, transformers_version
 
 
 def _get_folder_size(path: str) -> float:
@@ -292,9 +293,8 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader):
             and tmp_layer.weight_scale.shape[0] == 768
         ), "Illegal NVFP4 packing name or data_type or shape"
 
-    def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader):
+    def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader, monkeypatch):
         model_name = tiny_opt_model_path
-        from transformers import AutoConfig
 
         scheme = "NVFP4"
         autoround = AutoRound(
@@ -306,6 +306,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         autoround.quantize()
+        forbid_threaded_packing(monkeypatch, autoround_nvfp_mxfp_export)
         compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round")
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (

From a6845a36c70355c2564c0bd2efa2d4f73e2b23e5 Mon Sep 17 00:00:00 2001
From: Liang Lv
Date: Fri, 10 Apr 2026 14:21:12 +0800
Subject: [PATCH 5/5] Fix omni model test CI issue (#1667)

Signed-off-by: lvliang-intel
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Haihao Shen
---
 test/test_cpu/models/test_omni_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cpu/models/test_omni_model.py b/test/test_cpu/models/test_omni_model.py
index 2c7968752..136f19b31 100644
--- a/test/test_cpu/models/test_omni_model.py
+++ b/test/test_cpu/models/test_omni_model.py
@@ -263,6 +263,7 @@ def test_weight_fidelity(self):
         """Test that unfused weights match original fused weights."""
         from auto_round.modeling.fused_moe.replace_modules import apply_replacements, materialize_model_
 
+        torch.manual_seed(42)
         config = _make_tiny_qwen3_omni_moe_config()
         model = Qwen3OmniMoeForConditionalGeneration(config)