Merged

26 commits
7c5fe1b
refactor autoround
Kaihui-intel Dec 3, 2025
e32457d
add test_static_afp8_export&test_static_afp8_export
Kaihui-intel Dec 3, 2025
c867d0c
fix key
Kaihui-intel Dec 3, 2025
ce114a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 3, 2025
45c8eff
fix key
Kaihui-intel Dec 3, 2025
81d1b8f
support set_local
Kaihui-intel Dec 3, 2025
7c22ed8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 3, 2025
46516ce
update ar version v0.9.2rc
Kaihui-intel Dec 4, 2025
1538c91
merge main
Kaihui-intel Dec 4, 2025
6889568
remove transformers ut
Kaihui-intel Dec 4, 2025
6e1f6d3
fix target_bits ut
Kaihui-intel Dec 4, 2025
2f06c93
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2025
b7b6ab0
update xin comments
Kaihui-intel Dec 4, 2025
0d90f55
update ar v0.9.3rc and ut
Kaihui-intel Dec 4, 2025
6dd63c3
split autoround from woq
Kaihui-intel Dec 4, 2025
db3320b
update import
Kaihui-intel Dec 4, 2025
5748b2e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2025
84750ef
update layer config keys
Kaihui-intel Dec 4, 2025
aad1f4c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2025
f8761bd
Merge branch 'kaihui/ar_refactor' of https://github.com/intel/neural-…
Kaihui-intel Dec 4, 2025
d028006
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2025
395fc04
move test_autoround.py
Kaihui-intel Dec 4, 2025
77a1572
Merge branch 'kaihui/ar_refactor' of https://github.com/intel/neural-…
Kaihui-intel Dec 4, 2025
537f8b2
add reloading
Kaihui-intel Dec 5, 2025
c8617d4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 5, 2025
4f95ac5
add reload in ut
Kaihui-intel Dec 5, 2025
17 changes: 17 additions & 0 deletions neural_compressor/torch/algorithms/autoround/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2024 Intel Corporation

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The AutoRound-related modules."""

from .autoround import *
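
With this new entry point, the quantizer can be imported from the dedicated subpackage instead of the weight-only package; a minimal usage sketch, assuming AutoRoundQuantizer remains exported through the wildcard import above:

# Hypothetical import based on the new package layout; not part of this diff.
from neural_compressor.torch.algorithms.autoround import AutoRoundQuantizer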
@@ -41,78 +41,16 @@ def _is_auto_round_available():

from neural_compressor.common.utils import Statistics
from neural_compressor.torch.algorithms import Quantizer
from neural_compressor.torch.algorithms.weight_only.utility import CapturedDataloader, InputCaptureModule
from neural_compressor.torch.utils import get_accelerator, logger

from .utility import CapturedDataloader, InputCaptureModule


class AutoRoundQuantizer(Quantizer):
"""AutoRound Quantizer."""

def __init__(
self,
bits: int = None,
group_size: int = None,
sym: bool = None,
data_type: str = None,
act_bits: int = None,
act_group_size: int = None,
act_sym: bool = None,
act_data_type: str = None,
act_dynamic: bool = None,
super_bits: int = None,
super_group_size: int = None,
quant_config: dict = {}, # for INC
layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
enable_full_range: bool = False, ##for symmetric, TODO support later
batch_size: int = 8,
amp: bool = True,
device_map: str = None,
quant_lm_head: bool = False,
lr_scheduler=None,
dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
enable_quanted_input: bool = True,
enable_minmax_tuning: bool = True,
lr: float = None,
minmax_lr: float = None,
low_gpu_mem_usage: bool = False,
iters: int = 200,
seqlen: int = 2048,
nsamples: int = 128,
sampler: str = "rand",
seed: int = 42,
nblocks: int = 1,
gradient_accumulate_steps: int = 1,
not_use_best_mse: bool = False,
dynamic_max_gap: int = -1,
scale_dtype: str = "fp16",
to_quant_block_names: list = None,
low_cpu_mem_usage: bool = False,
export_format: str = "itrex",
# v0.4
enable_norm_bias_tuning: bool = False,
enable_torch_compile: bool = None,
# mllm
quant_nontext_module: bool = False,
extra_data_dir: str = None,
image_processor=None,
processor=None,
template: Union[str, Template] = None,
truncation: bool = False,
# 0.7
scheme: Union[str, dict, QuantizationScheme] = "W4A16",
# diffusion
guidance_scale: float = 7.5,
num_inference_steps: int = 50,
generator_seed: int = None,
# 0.9
target_bits: int = None,
options: Union[str, list[Union[str]], tuple[Union[str], ...]] = ("MXFP4", "MXFP8"),
shared_layers: Optional[Iterable[Iterable[str]]] = None,
ignore_scale_zp_bits: bool = False,
auto_scheme_method: str = "default",
auto_scheme_batch_size: int = None,
auto_scheme_device_map: str = None,
quant_config: Optional[dict] = None,
**kwargs,
):
"""Init a AutQRoundQuantizer object.
@@ -193,71 +131,14 @@ def __init__(
Returns:
The quantized model.
"""
super().__init__(quant_config)
self.layer_config = layer_config
self.output_dir = kwargs.pop("output_dir", "temp_auto_round")
self.tokenizer = kwargs.pop("tokenizer", "Placeholder") # for AutoRound initialization
self.enable_full_range = enable_full_range
self.bits = bits
self.group_size = group_size
self.sym = sym
self.data_type = data_type
self.act_bits = act_bits
self.act_group_size = act_group_size
self.act_sym = act_sym
self.act_data_type = act_data_type
self.act_dynamic = act_dynamic
self.super_bits = super_bits
self.super_group_size = super_group_size
self.batch_size = batch_size
self.amp = amp
super().__init__(quant_config=quant_config)
for k, v in kwargs.items():
setattr(self, k, v)
self.accelerator = get_accelerator(kwargs.pop("device", "auto"))
self.device = self.accelerator.name()
self.lr_scheduler = lr_scheduler
self.dataset = dataset
self.enable_quanted_input = enable_quanted_input
self.enable_minmax_tuning = enable_minmax_tuning
self.lr = lr
self.minmax_lr = minmax_lr
self.low_gpu_mem_usage = low_gpu_mem_usage
self.iters = iters
self.seqlen = seqlen
self.nsamples = nsamples
self.sampler = sampler
self.seed = seed
self.nblocks = nblocks
self.gradient_accumulate_steps = gradient_accumulate_steps
self.not_use_best_mse = not_use_best_mse
self.dynamic_max_gap = dynamic_max_gap
self.scale_dtype = scale_dtype
self.to_quant_block_names = to_quant_block_names
self.low_cpu_mem_usage = low_cpu_mem_usage
self.export_format = export_format
self.enable_norm_bias_tuning = enable_norm_bias_tuning
self.enable_torch_compile = enable_torch_compile
self.quant_nontext_module = quant_nontext_module
self.extra_data_dir = extra_data_dir
self.processor = processor
self.image_processor = image_processor
self.template = template
self.truncation = truncation
self.scheme = scheme
self.device_map = device_map
self.quant_lm_head = quant_lm_head
self.enable_w4afp8 = self._is_w4afp8()
self.guidance_scale = guidance_scale
self.num_inference_steps = num_inference_steps
self.generator_seed = generator_seed
self.target_bits = target_bits
self.options = options
self.shared_layers = shared_layers
self.ignore_scale_zp_bits = ignore_scale_zp_bits
self.auto_scheme_method = auto_scheme_method
self.auto_scheme_batch_size = auto_scheme_batch_size
self.auto_scheme_device_map = auto_scheme_device_map

def _is_w4afp8(self) -> bool:
return any([v.get("data_type", None) == "fp8_to_int_sym" for v in self.quant_config.values()])
return self.data_type == "fp8_to_int_sym"
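# Illustrative sketch, not part of this diff: with the kwargs-driven __init__ above, any
# keyword that is not an explicit parameter becomes an attribute via setattr(self, k, v)
# and is later forwarded to auto_round.AutoRound inside convert(). Values are examples only.
quantizer = AutoRoundQuantizer(
    quant_config=None,              # INC-side config, handled by the base Quantizer class
    scheme="W4A16",                 # forwarded to AutoRound unchanged
    iters=200,                      # AutoRound tuning knobs pass straight through kwargs
    output_dir="temp_auto_round",   # INC-side export directory, popped via keys_to_pop
)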

def prepare(self, model: torch.nn.Module, *args, **kwargs):
"""Prepares a given model for quantization.
@@ -290,7 +171,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
model = model.orig_model
if pipe is not None:
model = pipe
if self.target_bits is not None:
# Remove INC-specific args before forwarding the remaining attributes to the AutoRound constructor
keys_to_pop = ["quant_config", "device", "export_format", "output_dir", "accelerator", "reloading"]
if hasattr(self, "target_bits") and self.target_bits is not None:
from auto_round import AutoScheme

self.scheme = AutoScheme(
@@ -303,65 +186,28 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
device_map=self.auto_scheme_device_map,
low_gpu_mem_usage=self.low_gpu_mem_usage,
)
# Remove the AutoScheme args consumed above before passing the rest to the AutoRound constructor
keys_to_pop += [
"target_bits",
"options",
"shared_layers",
"ignore_scale_zp_bits",
"auto_scheme_method",
"auto_scheme_batch_size",
"auto_scheme_device_map",
]
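# Illustrative sketch, not part of this diff: passing target_bits activates the AutoScheme
# branch above. The option names are the defaults shown in this PR; the bit value is made up.
quantizer = AutoRoundQuantizer(
    quant_config=None,
    target_bits=6,                  # target bit budget for the mixed-precision search (illustrative)
    options=("MXFP4", "MXFP8"),     # candidate schemes AutoScheme may assign per layer
    ignore_scale_zp_bits=False,     # whether scale/zero-point bits count toward the budget
)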

rounder = AutoRound(
model,
layer_config=self.layer_config,
bits=self.bits,
data_type=self.data_type,
group_size=self.group_size,
sym=self.sym,
act_bits=self.act_bits,
act_group_size=self.act_group_size,
act_sym=self.act_sym,
act_data_type=self.act_data_type,
act_dynamic=self.act_dynamic,
super_bits=self.super_bits,
super_group_size=self.super_group_size,
tokenizer=tokenizer,
scheme=self.scheme,
processor=self.processor,
image_processor=self.image_processor,
enable_full_range=self.enable_full_range,
batch_size=self.batch_size,
amp=self.amp,
device_map=self.device_map,
lr_scheduler=self.lr_scheduler,
dataset=self.dataset,
extra_data_dir=self.extra_data_dir,
template=self.template,
quant_nontext_module=self.quant_nontext_module,
enable_quanted_input=self.enable_quanted_input,
enable_minmax_tuning=self.enable_minmax_tuning,
lr=self.lr,
minmax_lr=self.minmax_lr,
low_gpu_mem_usage=self.low_gpu_mem_usage,
low_cpu_mem_usage=self.low_gpu_mem_usage,
iters=self.iters,
seqlen=self.seqlen,
nsamples=self.nsamples,
sampler=self.sampler,
seed=self.seed,
nblocks=self.nblocks,
gradient_accumulate_steps=self.gradient_accumulate_steps,
not_use_best_mse=self.not_use_best_mse,
dynamic_max_gap=self.dynamic_max_gap,
scale_dtype=self.scale_dtype,
to_quant_block_names=self.to_quant_block_names,
enable_norm_bias_tuning=self.enable_norm_bias_tuning,
truncation=self.truncation,
enable_torch_compile=self.enable_torch_compile,
quant_lm_head=self.quant_lm_head,
guidance_scale=self.guidance_scale,
num_inference_steps=self.num_inference_steps,
generator_seed=self.generator_seed,
**{k: v for k, v in self.__dict__.items() if k not in keys_to_pop},
)
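# Illustrative sketch, not part of this diff: the forwarding pattern used above in isolation.
# Every attribute set in __init__ except the INC-side keys is handed to AutoRound as a kwarg.
forwarded = {k: v for k, v in self.__dict__.items() if k not in keys_to_pop}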

if self.enable_w4afp8:
if self._is_w4afp8():
model, weight_config = rounder.quantize()
model.autoround_config = weight_config
return rounder.save_quantized(output_dir=self.output_dir, inplace=True)
elif "itrex" in self.export_format:
elif "itrex" in self.export_format: # TODO: remove itrex related code later
model, weight_config = rounder.quantize()
model.autoround_config = weight_config
model = pack_model(model, weight_config, device=self.device, inplace=True)
@@ -373,10 +219,14 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
self.accelerator.empty_cache()
dump_model_op_stats(rounder.layer_config)

if self.export_format in ["auto_round", "llm_compressor"]:
reloading = self.__dict__.get("reloading", True)
if self.export_format in ["auto_round", "llm_compressor"] and reloading:
# The directly returned model holds QuantLinear modules intended for packing, so reload the exported model instead.
try:
logger.info(f"Quantization is done, reloading model from saved directory({self.output_dir})...")
logger.info(
f"Quantization is done, reloading model from saved directory({self.output_dir})...\n"
"Set reloading=False to skip."
)
import transformers # pylint: disable=E0401

model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir)
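# Illustrative sketch, not part of this diff: the reload step above can be skipped by
# passing reloading=False, as the new log message suggests; convert() then returns the
# model produced by AutoRound (holding QuantLinear modules) instead of reloading it.
quantizer = AutoRoundQuantizer(quant_config=None, export_format="auto_round", reloading=False)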