From a13bdf05f11ea6d64d8ca82fdca14bdfb6837902 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 15:58:51 +0800 Subject: [PATCH 01/18] fix imatrix pad issue --- auto_round/data_type/int.py | 2 +- auto_round/data_type/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 699466dc8..996b2589d 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -71,7 +71,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 imatrix = 1.0 else: imatrix = imatrix.reshape(1, -1) - + imatrix= reshape_pad_tensor_by_group_size(imatrix, group_size, val=1e-5)[0].view(1,-1) imatrix = imatrix.expand(tensor.numel() // imatrix.numel(), -1) imatrix = imatrix.reshape(tensor.shape) diff --git a/auto_round/data_type/utils.py b/auto_round/data_type/utils.py index 1bb53a14b..517f8342f 100644 --- a/auto_round/data_type/utils.py +++ b/auto_round/data_type/utils.py @@ -23,7 +23,7 @@ from auto_round.utils import logger -def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int): +def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val:float=0.0): """Reshapes and pads the tensor to ensure that it can be quantized in groups of `group_size`. This function adjusts the @@ -55,7 +55,7 @@ def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int): return data, orig_shape, pad_len else: pad_len = (data.shape[1] + group_size - 1) // group_size * group_size - data.shape[1] - data_new = torch.nn.functional.pad(data, (0, pad_len)) + data_new = torch.nn.functional.pad(data, (val, pad_len)) data_new = data_new.reshape(-1, group_size) return data_new, orig_shape, pad_len From 4e201998387a382619141fe99abde8928863891b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 08:01:11 +0000 Subject: [PATCH 02/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/int.py | 2 +- auto_round/data_type/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 996b2589d..8fc6f79a0 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -71,7 +71,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 imatrix = 1.0 else: imatrix = imatrix.reshape(1, -1) - imatrix= reshape_pad_tensor_by_group_size(imatrix, group_size, val=1e-5)[0].view(1,-1) + imatrix = reshape_pad_tensor_by_group_size(imatrix, group_size, val=1e-5)[0].view(1, -1) imatrix = imatrix.expand(tensor.numel() // imatrix.numel(), -1) imatrix = imatrix.reshape(tensor.shape) diff --git a/auto_round/data_type/utils.py b/auto_round/data_type/utils.py index 517f8342f..4e458a669 100644 --- a/auto_round/data_type/utils.py +++ b/auto_round/data_type/utils.py @@ -23,7 +23,7 @@ from auto_round.utils import logger -def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val:float=0.0): +def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val: float = 0.0): """Reshapes and pads the tensor to ensure that it can be quantized in groups of `group_size`. 
This function adjusts the From 405bde72cc545fd6671e2fc7d08f9e70fd66377f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:05:26 +0800 Subject: [PATCH 03/18] update --- auto_round/__main__.py | 7 ++ auto_round/compressors/base.py | 6 +- auto_round/data_type/gguf.py | 124 +++++++++++--------- auto_round/data_type/utils.py | 2 +- auto_round/export/export_to_awq/utils.py | 6 - auto_round/export/export_to_gguf/packing.py | 4 +- auto_round/utils/device.py | 16 +++ 7 files changed, 99 insertions(+), 66 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 76a8f73d1..1ddd07660 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -172,6 +172,12 @@ def __init__(self, *args, **kwargs): type=float, help="Learning rate specifically for min-max tuning. " "If None, uses the same value as --lr. ", ) + tuning.add_argument( + "--momentum", + default=0, + type=float, + help="", + ) tuning.add_argument( "--gradient_accumulate_steps", default=1, @@ -591,6 +597,7 @@ def tune(args): extra_config=extra_config, layer_config=layer_config, model_dtype=args.model_dtype, + momentum=args.momentum, ) model_name = args.model.rstrip("/") diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2634769a1..2dc249517 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -152,6 +152,7 @@ def __init__( disable_opt_rtn: bool = False, seed: int = 42, low_cpu_mem_usage: bool = False, + momentum = 0.0, **kwargs, ): """Initialize AutoRound with quantization and tuning configuration. @@ -250,6 +251,7 @@ def __init__( self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES self.scale_dtype = convert_dtype_str2torch(scale_dtype) self.low_cpu_mem_usage = low_cpu_mem_usage + self.momentum = momentum if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") @@ -2625,10 +2627,10 @@ def _quantize_block( minmax_lr = torch.tensor(self.minmax_lr) if self.enable_minmax_tuning: optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0, momentum=self.momentum ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0,momentum=self.momentum) if len(round_params) + len(minmax_params) <= 0: dump_info = ( diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index f20c6c7a6..ab8cd01d2 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -17,6 +17,7 @@ from auto_round.data_type.register import register_dtype from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste +from auto_round.utils.device import clear_memory from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants from auto_round.logger import logger @@ -320,7 +321,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens @torch.no_grad() -def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None): +def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None,split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 group_size = 16 if bits == 2 else 32 @@ -348,6 +349,7 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri nstep=params["nstep"], use_mad=params["use_mad"], weights=quant_weights, + split_num=split_num ) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) @@ -446,10 +448,15 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 + if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head + split_num=16 + else: + split_num=1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) + tensor = tensor.to(torch.float32) if scale is None: - scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym(tensor, bits, scale_dtype, imatrix) + scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym(tensor, bits, scale_dtype, imatrix,split_num=split_num) inverse_scale = get_reciprocal(scale) int_w = torch.clamp(round_ste((tensor + wmin) * inverse_scale + v), 0, maxq) @@ -458,7 +465,62 @@ def quant_tensor_gguf_asym_dq( return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} -def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): +def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8): + dtype = torch.float32 + data = data.to(dtype) + maxq = 2**bits - 1 + minq = 0 + weights = 1.0 if weights is None else weights.to(dtype) + + results_scale = [] + results_rmin = [] + chunk_size = (data.shape[0]+split_num-1)//split_num + for start in range(0, data.shape[0], chunk_size): + end = min(start + chunk_size, data.shape[0]) + chunk = data[start:end] + chunk_weights = weights if isinstance(weights, float) else weights[start:end] + + rmin = torch.min(chunk, dim=1, keepdim=True)[0] + rmax = torch.max(chunk, dim=1, 
keepdim=True)[0] + sum_w = torch.sum(chunk_weights, dim=1, keepdim=True) + sum_x = torch.sum(chunk_weights * chunk, dim=1, keepdim=True) + scale = (rmax - rmin) / (maxq - minq) + iscale = get_reciprocal(scale) + quant_data = torch.clamp(torch.round(iscale * (chunk - rmin)), minq, maxq) + diff = scale * quant_data + rmin - chunk + best_mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True) + + for is_ in range(nstep): + factor = rrmin + rdelta * is_ + maxq - minq + scale_new = (rmax - rmin) / factor + iscale_new = get_reciprocal(scale_new) + quant_data_new = torch.clamp(torch.round(iscale_new * (chunk - rmin)), minq, maxq) + mul_weights_quant_data = chunk_weights * quant_data_new + sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) + sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) + sum_xl = torch.sum(mul_weights_quant_data * chunk, dim=-1, keepdim=True) + D = sum_w * sum_l2 - torch.pow(sum_l, 2) + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D + this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D + this_min[this_min > 0] = 0 + this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] + reverse_this_scale = get_reciprocal(this_scale) + quant_data = torch.clamp(torch.round(reverse_this_scale * (chunk - this_min)), minq, maxq) + diff = this_scale * quant_data + this_min - chunk + mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=-1, keepdim=True) + idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] + best_mad[idx_to_replace] = mad[idx_to_replace] + scale[idx_to_replace] = this_scale[idx_to_replace] + rmin[idx_to_replace] = this_min[idx_to_replace] + results_scale.append(scale.to(torch.float32)) + results_rmin.append(-rmin.to(torch.float32)) + if split_num>1: + clear_memory(device_list=[data.device]) + + return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) + + +def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None,split_num=1): """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. 
Args: @@ -473,57 +535,9 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u Returns: Tuple: (Optimal scale tensor, optimal minimum value tensor) """ - dtype = torch.float32 - data = data.to(dtype) - maxq = 2**bits - 1 - minq = 0 - weights = 1.0 if weights is None else weights.to(dtype) - - rmin = torch.min(data, dim=1, keepdim=True)[0] - rmax = torch.max(data, dim=1, keepdim=True)[0] - - sum_w = torch.sum(weights, dim=1, keepdim=True) - sum_x = torch.sum(weights * data, dim=1, keepdim=True) - - # scale = 1 / ((maxq - minq) / (rmax - rmin + 1e-8)) - scale = (rmax - rmin) / (maxq - minq) - iscale = get_reciprocal(scale) - # quant_data = torch.clamp(torch.round((maxq - minq) / (rmax - rmin + 1e-8) * (data - rmin)), minq, maxq) - quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq) - diff = scale * quant_data + rmin - data - - best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True) - - for is_ in range(nstep): - factor = rrmin + rdelta * is_ + maxq - minq - # iscale_new = factor / (rmax - rmin + 1e-8) - scale_new = (rmax - rmin) / factor - iscale_new = get_reciprocal(scale_new) - quant_data_new = torch.clamp(torch.round(iscale_new * (data - rmin)), minq, maxq) - - mul_weights_quant_data = weights * quant_data_new - sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) - sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) - sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True) - - D = sum_w * sum_l2 - torch.pow(sum_l, 2) - this_scale = (sum_w * sum_xl - sum_x * sum_l) / D - this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D - this_min[this_min > 0] = 0 - this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] - reverse_this_scale = get_reciprocal(this_scale) - - quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq) - diff = this_scale * quant_data + this_min - data - # diff = this_scale * quant_data_new + this_min - data - mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True) - - idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] - best_mad[idx_to_replace] = mad[idx_to_replace] - scale[idx_to_replace] = this_scale[idx_to_replace] - rmin[idx_to_replace] = this_min[idx_to_replace] - - return scale.to(torch.float32), -rmin.to(torch.float32) + return iterative_wls_quant_search_chunk(data=data, bits=bits, rrmin=rrmin, + rdelta=rdelta, nstep=nstep, use_mad=use_mad, + weights=weights, split_num=split_num) @torch.no_grad() @@ -550,7 +564,6 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): return scale -# @register_dtype("rtn_int_sym_dq") def quant_tensor_gguf_sym_dq( tensor, @@ -566,7 +579,6 @@ def quant_tensor_gguf_sym_dq( Args: tensor: Tensor containing the tensor to be quantized bits: Number of bits for quantization (e.g., 2, 3, 4, 8) - group_size: Number of elements to share scale for quantization v: Rounding value perturbation min_scale: Minimum scale coefficient for tensor max_scale: Maximum scale coefficient for tensor diff --git a/auto_round/data_type/utils.py b/auto_round/data_type/utils.py index 517f8342f..ee834db20 100644 --- a/auto_round/data_type/utils.py +++ b/auto_round/data_type/utils.py @@ -55,7 +55,7 @@ def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val:fl return data, orig_shape, pad_len else: pad_len = (data.shape[1] + group_size - 1) // group_size * group_size - 
data.shape[1] - data_new = torch.nn.functional.pad(data, (val, pad_len)) + data_new = torch.nn.functional.pad(data, (0, pad_len), value=val) data_new = data_new.reshape(-1, group_size) return data_new, orig_shape, pad_len diff --git a/auto_round/export/export_to_awq/utils.py b/auto_round/export/export_to_awq/utils.py index 0052ec9b1..4a6a48efe 100644 --- a/auto_round/export/export_to_awq/utils.py +++ b/auto_round/export/export_to_awq/utils.py @@ -317,9 +317,3 @@ def extra_repr(self) -> str: self.group_size, ) - -def clear_memory(weight=None): - if weight is not None: - del weight - gc.collect() - torch.cuda.empty_cache() diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 4c64a75d5..05c15ef0a 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -16,7 +16,7 @@ import torch from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, K_SCALE_SIZE, QK_K -from auto_round.utils import get_reciprocal +from auto_round.utils import get_reciprocal, clear_memory GGML_QUANT_TYPE = {} @@ -59,6 +59,7 @@ def ggml_quant( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) except Exception: + clear_memory() device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -66,6 +67,7 @@ def ggml_quant( wmin = wmin.to(device) if wmin is not None else wmin d_scale = d_scale.to(device) if d_scale is not None else d_scale d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(device) if imatrix is not None else imatrix new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 2f63a3a2d..38de9a96f 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1308,3 +1308,19 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None return sorted(devices) raise TypeError(f"Unsupported device_map type: {type(device_map)}") + + +def gpu_synchronize(devices): + def _gpu_synchronize(device): + if torch.cuda.is_available(): + torch.cuda.synchronize(device) + elif torch.xpu.is_available(): + torch.xpu.synchronize(device) + + if isinstance(devices,(list,tuple)): + for device in devices: + _gpu_synchronize(device) + else: + _gpu_synchronize(devices) + + From 886a6c85f3592282f51ca0fd62fd0acf6772ab89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:06:41 +0000 Subject: [PATCH 04/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 9 ++-- auto_round/data_type/gguf.py | 51 ++++++++++++++------- auto_round/export/export_to_awq/utils.py | 1 - auto_round/export/export_to_gguf/packing.py | 2 +- auto_round/utils/device.py | 4 +- 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2dc249517..25f3825c1 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -152,7 +152,7 @@ def __init__( disable_opt_rtn: bool = False, seed: int = 42, low_cpu_mem_usage: bool = False, - momentum = 0.0, + momentum=0.0, **kwargs, ): """Initialize AutoRound with quantization and tuning configuration. 
@@ -2627,10 +2627,13 @@ def _quantize_block( minmax_lr = torch.tensor(self.minmax_lr) if self.enable_minmax_tuning: optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0, momentum=self.momentum + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], + lr=lr, + weight_decay=0, + momentum=self.momentum, ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0,momentum=self.momentum) + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0, momentum=self.momentum) if len(round_params) + len(minmax_params) <= 0: dump_info = ( diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index ab8cd01d2..ed05349b9 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -17,11 +17,11 @@ from auto_round.data_type.register import register_dtype from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste -from auto_round.utils.device import clear_memory from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants from auto_round.logger import logger from auto_round.utils import get_reciprocal +from auto_round.utils.device import clear_memory @register_dtype("int_sym_dq") @@ -321,7 +321,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens @torch.no_grad() -def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None,split_num=1): +def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None, split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 group_size = 16 if bits == 2 else 32 @@ -349,7 +349,7 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri nstep=params["nstep"], use_mad=params["use_mad"], weights=quant_weights, - split_num=split_num + split_num=split_num, ) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) @@ -448,15 +448,17 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head - split_num=16 + if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head + split_num = 16 else: - split_num=1 + split_num = 1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) tensor = tensor.to(torch.float32) if scale is None: - scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym(tensor, bits, scale_dtype, imatrix,split_num=split_num) + scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym( + tensor, bits, scale_dtype, imatrix, split_num=split_num + ) inverse_scale = get_reciprocal(scale) int_w = torch.clamp(round_ste((tensor + wmin) * inverse_scale + v), 0, maxq) @@ -465,7 +467,9 @@ def quant_tensor_gguf_asym_dq( return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} -def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8): +def iterative_wls_quant_search_chunk( + data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 +): dtype = torch.float32 data = data.to(dtype) maxq = 2**bits - 1 @@ -474,7 +478,7 @@ def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep results_scale = 
[] results_rmin = [] - chunk_size = (data.shape[0]+split_num-1)//split_num + chunk_size = (data.shape[0] + split_num - 1) // split_num for start in range(0, data.shape[0], chunk_size): end = min(start + chunk_size, data.shape[0]) chunk = data[start:end] @@ -488,7 +492,9 @@ def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep iscale = get_reciprocal(scale) quant_data = torch.clamp(torch.round(iscale * (chunk - rmin)), minq, maxq) diff = scale * quant_data + rmin - chunk - best_mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True) + best_mad = torch.sum( + (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True + ) for is_ in range(nstep): factor = rrmin + rdelta * is_ + maxq - minq @@ -507,20 +513,26 @@ def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep reverse_this_scale = get_reciprocal(this_scale) quant_data = torch.clamp(torch.round(reverse_this_scale * (chunk - this_min)), minq, maxq) diff = this_scale * quant_data + this_min - chunk - mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=-1, keepdim=True) + mad = torch.sum( + (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), + dim=-1, + keepdim=True, + ) idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] best_mad[idx_to_replace] = mad[idx_to_replace] scale[idx_to_replace] = this_scale[idx_to_replace] rmin[idx_to_replace] = this_min[idx_to_replace] results_scale.append(scale.to(torch.float32)) results_rmin.append(-rmin.to(torch.float32)) - if split_num>1: + if split_num > 1: clear_memory(device_list=[data.device]) return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) -def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None,split_num=1): +def iterative_wls_quant_search( + data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=1 +): """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. 
Args: @@ -535,9 +547,16 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u Returns: Tuple: (Optimal scale tensor, optimal minimum value tensor) """ - return iterative_wls_quant_search_chunk(data=data, bits=bits, rrmin=rrmin, - rdelta=rdelta, nstep=nstep, use_mad=use_mad, - weights=weights, split_num=split_num) + return iterative_wls_quant_search_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + split_num=split_num, + ) @torch.no_grad() diff --git a/auto_round/export/export_to_awq/utils.py b/auto_round/export/export_to_awq/utils.py index 4a6a48efe..871e4287a 100644 --- a/auto_round/export/export_to_awq/utils.py +++ b/auto_round/export/export_to_awq/utils.py @@ -316,4 +316,3 @@ def extra_repr(self) -> str: self.w_bit, self.group_size, ) - diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 05c15ef0a..f54f78a02 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -16,7 +16,7 @@ import torch from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, K_SCALE_SIZE, QK_K -from auto_round.utils import get_reciprocal, clear_memory +from auto_round.utils import clear_memory, get_reciprocal GGML_QUANT_TYPE = {} diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 38de9a96f..a281c4b46 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1317,10 +1317,8 @@ def _gpu_synchronize(device): elif torch.xpu.is_available(): torch.xpu.synchronize(device) - if isinstance(devices,(list,tuple)): + if isinstance(devices, (list, tuple)): for device in devices: _gpu_synchronize(device) else: _gpu_synchronize(devices) - - From e2d7e704e501d5e5e8b910551dc4841f1540d914 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:21:12 +0800 Subject: [PATCH 05/18] refine --- auto_round/compressors/base.py | 2 +- auto_round/data_type/gguf.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2dc249517..68dfaec89 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2630,7 +2630,7 @@ def _quantize_block( [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0, momentum=self.momentum ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0,momentum=self.momentum) + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0, momentum=self.momentum) if len(round_params) + len(minmax_params) <= 0: dump_info = ( diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index ab8cd01d2..c77c3381f 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -448,10 +448,11 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head - split_num=16 - else: - split_num=1 + split_num = 1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) tensor = tensor.to(torch.float32) @@ -535,6 +536,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u Returns: Tuple: (Optimal scale tensor, optimal minimum value tensor) """ + # TODO this one should change to try catch later return 
iterative_wls_quant_search_chunk(data=data, bits=bits, rrmin=rrmin, rdelta=rdelta, nstep=nstep, use_mad=use_mad, weights=weights, split_num=split_num) From 21300752a15acc6cfc4a7403d5e2fe0fb18f474e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:24:15 +0800 Subject: [PATCH 06/18] clean --- auto_round/utils/device.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index a281c4b46..2f63a3a2d 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1308,17 +1308,3 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None return sorted(devices) raise TypeError(f"Unsupported device_map type: {type(device_map)}") - - -def gpu_synchronize(devices): - def _gpu_synchronize(device): - if torch.cuda.is_available(): - torch.cuda.synchronize(device) - elif torch.xpu.is_available(): - torch.xpu.synchronize(device) - - if isinstance(devices, (list, tuple)): - for device in devices: - _gpu_synchronize(device) - else: - _gpu_synchronize(devices) From 9ecf7e6273435c11a06a13fddacb15235dc1afbe Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:28:36 +0800 Subject: [PATCH 07/18] update --- auto_round/data_type/gguf.py | 99 ++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 913de1409..ecf60f280 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -460,7 +460,74 @@ def quant_tensor_gguf_asym_dq( qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} +def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): + """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. + Args: + data (torch.Tensor): Input tensor to quantize. + bits (int): Number of quantization bits. + rrmin (float): Initial range scaling factor. + rdelta (float): Step size for range scaling. + nstep (int): Number of search steps. + use_mad (bool): Whether to use mean absolute deviation instead of squared error. + weights (torch.Tensor): Weight matrix for each element. 
+ + Returns: + Tuple: (Optimal scale tensor, optimal minimum value tensor) + """ + dtype = torch.float32 + data = data.to(dtype) + maxq = 2**bits - 1 + minq = 0 + weights = 1.0 if weights is None else weights.to(dtype) + + rmin = torch.min(data, dim=1, keepdim=True)[0] + rmax = torch.max(data, dim=1, keepdim=True)[0] + + sum_w = torch.sum(weights, dim=1, keepdim=True) + sum_x = torch.sum(weights * data, dim=1, keepdim=True) + + # scale = 1 / ((maxq - minq) / (rmax - rmin + 1e-8)) + scale = (rmax - rmin) / (maxq - minq) + iscale = get_reciprocal(scale) + # quant_data = torch.clamp(torch.round((maxq - minq) / (rmax - rmin + 1e-8) * (data - rmin)), minq, maxq) + quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq) + diff = scale * quant_data + rmin - data + + best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True) + + for is_ in range(nstep): + factor = rrmin + rdelta * is_ + maxq - minq + # iscale_new = factor / (rmax - rmin + 1e-8) + scale_new = (rmax - rmin) / factor + iscale_new = get_reciprocal(scale_new) + quant_data_new = torch.clamp(torch.round(iscale_new * (data - rmin)), minq, maxq) + + mul_weights_quant_data = weights * quant_data_new + sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) + sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) + sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True) + + D = sum_w * sum_l2 - torch.pow(sum_l, 2) + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D + this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D + this_min[this_min > 0] = 0 + this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] + reverse_this_scale = get_reciprocal(this_scale) + + quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq) + diff = this_scale * quant_data + this_min - data + # diff = this_scale * quant_data_new + this_min - data + mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True) + + idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] + best_mad[idx_to_replace] = mad[idx_to_replace] + scale[idx_to_replace] = this_scale[idx_to_replace] + rmin[idx_to_replace] = this_min[idx_to_replace] + + return scale.to(torch.float32), -rmin.to(torch.float32) + +# TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 ): @@ -543,16 +610,28 @@ def iterative_wls_quant_search( """ # TODO this one should change to try catch later - return iterative_wls_quant_search_chunk( - data=data, - bits=bits, - rrmin=rrmin, - rdelta=rdelta, - nstep=nstep, - use_mad=use_mad, - weights=weights, - split_num=split_num, - ) + if split_num>1: + return iterative_wls_quant_search_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + split_num=split_num, + ) + else: + return iterative_wls_quant_search_non_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + ) + @torch.no_grad() From ea310ec2eed98881b0c6c22186d742d2b30cc165 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:29:32 +0000 Subject: [PATCH 08/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
auto_round/data_type/gguf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index ecf60f280..577ccf34e 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -460,6 +460,7 @@ def quant_tensor_gguf_asym_dq( qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} + def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. @@ -527,6 +528,7 @@ def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, n return scale.to(torch.float32), -rmin.to(torch.float32) + # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 @@ -610,7 +612,7 @@ def iterative_wls_quant_search( """ # TODO this one should change to try catch later - if split_num>1: + if split_num > 1: return iterative_wls_quant_search_chunk( data=data, bits=bits, @@ -633,7 +635,6 @@ def iterative_wls_quant_search( ) - @torch.no_grad() def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): from auto_round.export.export_to_gguf.config import K_SCALE_SIZE, QK_K From 967af5503f8e58c23012f4577dd77d644d9f470c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:37:25 +0800 Subject: [PATCH 09/18] update --- auto_round/compressors/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 25f3825c1..4e954a20a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -152,7 +152,6 @@ def __init__( disable_opt_rtn: bool = False, seed: int = 42, low_cpu_mem_usage: bool = False, - momentum=0.0, **kwargs, ): """Initialize AutoRound with quantization and tuning configuration. @@ -194,7 +193,7 @@ def __init__( super_group_size, super_bits, scale_dtype ("fp16" etc.), nblocks, to_quant_block_names, enable_norm_bias_tuning, enable_quanted_input, - disable_deterministic_algorithms, mllm, static_kv_dtype,enable_deterministic_algorithms + disable_deterministic_algorithms, mllm, static_kv_dtype,enable_deterministic_algorithms,momentum Raises: ValueError: If invalid device is provided or tokenizer is missing for non-str model with iters > 0. RuntimeError: If model parameters are on meta device. @@ -235,6 +234,7 @@ def __init__( enable_quanted_input: bool = kwargs.pop("enable_quanted_input", True) disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", True) enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False) + self.momentum = kwargs.pop("momentum", 0.0) static_kv_dtype = kwargs.pop("static_kv_dtype", None) model_dtype = kwargs.pop("model_dtype", None) device = kwargs.pop("device", None) @@ -251,7 +251,7 @@ def __init__( self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES self.scale_dtype = convert_dtype_str2torch(scale_dtype) self.low_cpu_mem_usage = low_cpu_mem_usage - self.momentum = momentum + if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") From 356ee30eda423be84efee5d06422242b8c913690 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:38:55 +0000 Subject: [PATCH 10/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 4e954a20a..c4ad37bd4 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -252,7 +252,6 @@ def __init__( self.scale_dtype = convert_dtype_str2torch(scale_dtype) self.low_cpu_mem_usage = low_cpu_mem_usage - if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: From 5f4d85cc0655f7789f5174473d57806dbc168801 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:42:14 +0800 Subject: [PATCH 11/18] Update auto_round/__main__.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 1ddd07660..19fef935a 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -176,7 +176,7 @@ def __init__(self, *args, **kwargs): "--momentum", default=0, type=float, - help="", + help="Momentum factor for the optimizer. Default is 0 (no momentum).", ) tuning.add_argument( "--gradient_accumulate_steps", From a9fe2113b86ae32d42915b7e05424933d3fe9110 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 09:06:57 +0800 Subject: [PATCH 12/18] update --- auto_round/compressors/base.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 4e954a20a..2432551fc 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2625,15 +2625,25 @@ def _quantize_block( lr = torch.tensor(self.lr) minmax_lr = torch.tensor(self.minmax_lr) + is_adam = "adam" in self.__class__.__name__.lower() + + + extra_kwargs = {} if is_adam else {"momentum": self.momentum} + if self.enable_minmax_tuning: - optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], - lr=lr, - weight_decay=0, - momentum=self.momentum, - ) + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0, momentum=self.momentum) + params = round_params + + optimizer = self.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) if len(round_params) + len(minmax_params) <= 0: dump_info = ( From 63ae0c21c784fa13e26ba4c371f331387137b6e2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:08:06 +0000 Subject: [PATCH 13/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6578b0f86..23ffd0df5 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2626,7 +2626,6 @@ def _quantize_block( minmax_lr = torch.tensor(self.minmax_lr) is_adam = "adam" in self.__class__.__name__.lower() - 
extra_kwargs = {} if is_adam else {"momentum": self.momentum} if self.enable_minmax_tuning: From 36d41af93061eef978540638574597f4a56f7444 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 09:11:22 +0800 Subject: [PATCH 14/18] refine comments --- auto_round/compressors/base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c8d770dc2..00e7fded2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1558,11 +1558,13 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - # Assign temporary names after replacing modules - for n, m in self.model.named_modules(): # TODO check if could removed + + # Temporary names must be assigned after handle_moe_model; + # placing them earlier would cause them to be removed when the module is replaced. + for n, m in self.model.named_modules(): m.tmp_name = n - # TODO check scale_dtype + if not self.is_auto_scheme: enable_gguf_official_mixed = True else: From a3a19e2dd508384d8719449e3cd188a98ce5588b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:11:57 +0000 Subject: [PATCH 15/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 00e7fded2..daffe0acc 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1564,7 +1564,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: for n, m in self.model.named_modules(): m.tmp_name = n - if not self.is_auto_scheme: enable_gguf_official_mixed = True else: From c58403965bbcf53e510d3146dcf7e483217b5cab Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 10:19:59 +0800 Subject: [PATCH 16/18] update readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 29eeae56f..acec0b3e3 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ See our [paper](https://arxiv.org/pdf/2309.05516) for more details. For usage in ## 🆕 What's New -[2025/11] AutoRound now offers preliminary support for an **enhanced GGUF quantization algorithm** via `--enable_alg_ext`. For detailed accuracy benchmarks, please refer to the accompanying [documentation](./docs/gguf_alg_ext_acc.md). +[2025/11] AutoRound now offers preliminary support for an enhanced GGUF quantization algorithm via `--enable_alg_ext`. For detailed accuracy benchmarks, please refer to the [documentation](./docs/gguf_alg_ext_acc.md). [2025/10] AutoRound has been integrated into **SGLang**. You can now run models in the AutoRound format directly using the latest SGLang later than v0.5.4. @@ -192,7 +192,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") - **`layer_config` (dict)**: Configuration for weight quantization (default is `None`), mainly for mixed schemes. ##### Algorithm Settings -- **`enable_alg_ext` (bool)**: Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. 
+- **`enable_alg_ext` (bool)**: [Experimental Feature] Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. - **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled). ##### Tuning Process Parameters @@ -208,6 +208,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") ##### Device/Speed Configuration - **`enable_torch_compile` (bool)**: If no exception is raised, typically we recommend setting it to True for faster quantization with lower resource. - **`low_gpu_mem_usage` (bool)**: Whether to offload intermediate features to CPU at the cost of ~20% more tuning time (default is `False`). +- **`low_cpu_mem_usage` (bool)**: [Experimental Feature]Whether to enable saving immediately to save ram usage (default is `False`). - **`device_map` (str|dict|int)**: The device to be used for tuning, e.g., `auto`, "cpu"`, `"cuda"`, `"0,1,2"` (default is `'0'`). When using "auto", it will try to use all available GPUs. From 267ff642ecf52f70ebd77a5183fc8bd2969280ed Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 13:04:12 +0800 Subject: [PATCH 17/18] refine readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index acec0b3e3..598c3fd54 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,7 @@ refer to the documentation for accuracy [results](./docs/auto_scheme_acc.md) and for some accuracy results. [2025/07] AutoRound now offers experimental support for **GGUF** format, and recommends using optimized RTN mode (--iters 0) for - all bits other than 3 bits. **A more advanced algorithm** tailored for specific configurations may be available in - v0.8.1. + all bits other than 3 bits. [2025/05] AutoRound has been integrated into **Transformers** and **vLLM**. From 0bc902fd5b99815e01d274a75e7e08a906c3f10c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 13:18:13 +0800 Subject: [PATCH 18/18] refine --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index f54f78a02..bc9189b7b 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -59,7 +59,6 @@ def ggml_quant( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) except Exception: - clear_memory() device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -68,6 +67,7 @@ def ggml_quant( d_scale = d_scale.to(device) if d_scale is not None else d_scale d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin imatrix = imatrix.to(device) if imatrix is not None else imatrix + clear_memory() new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original )
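
The patches above thread a new `momentum` option through the stack: PATCH 03/18 adds the `--momentum` CLI flag, PATCH 09/18 has the compressor pick `momentum` up from `**kwargs` (default 0.0), and PATCH 12/18 forwards it to the optimizer for non-Adam optimizers only. The minimal sketch below (not part of the patch series) shows how the option could be exercised from the Python API; aside from `momentum` and the `quantize_and_save(output_dir=..., format=...)` call taken from the README snippet in these diffs, the constructor usage and the model id are assumptions about AutoRound's public entry point rather than something these patches define.

# Illustrative sketch only; assumes the public AutoRound entry point forwards
# extra keyword arguments to BaseCompressor.__init__, where PATCH 09/18 pops
# `momentum` from **kwargs and PATCH 12/18 passes it to non-Adam optimizers.
from auto_round import AutoRound

ar = AutoRound(
    "Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model id for illustration
    momentum=0.9,                  # new option; the default 0.0 means no momentum
)
ar.quantize_and_save(output_dir="./qmodel", format="auto_round")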