From 8ceb6a1d59209f62a96308cafcdf9e53568c5855 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 12:03:51 +0800 Subject: [PATCH 01/57] reduce vram --- auto_round/compressors/base.py | 14 +++++++++++--- auto_round/data_type/gguf.py | 12 ++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 7e9288f8d..b6ea8d40a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1050,7 +1050,7 @@ def _get_save_folder_name(self, format_str: str) -> str: return self.orig_output_dir - @torch.inference_mode() + # @torch.inference_mode() def _quantize_embedding_layer(self): """Quantizes embedding layers in the model according to the configuration. @@ -1122,9 +1122,12 @@ def _quantize_embedding_layer(self): # Update config self.layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(self.device_list) + - # Release memory - clear_memory(device_list=self.device_list) return is_quantized @@ -1354,10 +1357,14 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: has_gguf_k = ( any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) + if has_gguf_k: + self.model.to(torch.float32) self._quantize_embedding_layer() self.model.to("cpu") + # Release memory + clear_memory(device_list=self.device_list) enable_imatrix = False if not self.disable_opt_rtn: @@ -1628,6 +1635,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) is_quantized_embedding = self._quantize_embedding_layer() + clear_memory(device_list=self.device_list) all_q_inputs = None if is_quantized_embedding: all_inputs = copy.deepcopy(self.inputs) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 577ccf34e..1bf920a9c 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -408,6 +408,8 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri d_wmin = d_wmin.unsqueeze(-1) scale = (d_scale * q_scale).view(-1, 1) wmin = (d_wmin * q_wmin).view(-1, 1) + if split_num > 1: + clear_memory([tensor.device]) return scale, wmin, d_scale, d_wmin @@ -455,10 +457,12 @@ def quant_tensor_gguf_asym_dq( ) inverse_scale = get_reciprocal(scale) - int_w = torch.clamp(round_ste((tensor + wmin) * inverse_scale + v), 0, maxq) - qdq_result = (scale * int_w - wmin).to(orig_dtype) - qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) - return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} + tensor = tensor.add_(wmin) + tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0,maxq) + tensor = tensor.mul_(scale) + tensor = tensor.subtract_(wmin).to(orig_dtype) + tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) + return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): From 97f460e4a57db5c70bedc17ef6ff11625794e87f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 04:05:11 +0000 Subject: [PATCH 02/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --- auto_round/compressors/base.py | 2 -- auto_round/data_type/gguf.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index b6ea8d40a..544aea07c 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1127,8 +1127,6 @@ def _quantize_embedding_layer(self): del zp clear_memory(self.device_list) - - return is_quantized def _quant_rtn_with_imatrix(self, all_to_quantized_module_names: list[str]) -> None: diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 1bf920a9c..21c5a1e0d 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -458,7 +458,7 @@ def quant_tensor_gguf_asym_dq( inverse_scale = get_reciprocal(scale) tensor = tensor.add_(wmin) - tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0,maxq) + tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0, maxq) tensor = tensor.mul_(scale) tensor = tensor.subtract_(wmin).to(orig_dtype) tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) From dd27f91a0ef512cc9c80322f7b89c7af3795b54e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 14:46:48 +0800 Subject: [PATCH 03/57] update --- auto_round/compressors/base.py | 39 +++++---- auto_round/data_type/gguf.py | 73 ++++++++++++---- auto_round/export/export_to_gguf/packing.py | 97 ++++++++++++++++++++- auto_round/utils/common.py | 29 ++++-- auto_round/wrapper.py | 4 +- 5 files changed, 196 insertions(+), 46 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 544aea07c..3cd49771b 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1050,7 +1050,7 @@ def _get_save_folder_name(self, format_str: str) -> str: return self.orig_output_dir - # @torch.inference_mode() + @torch.inference_mode() def _quantize_embedding_layer(self): """Quantizes embedding layers in the model according to the configuration. @@ -1085,11 +1085,15 @@ def _quantize_embedding_layer(self): dtype = f"rtn_{dtype}" quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, to avoid cache a bf16 copy we'd better use float32 + if config["super_group_size"] is not None: + dtype = torch.float32 # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(self.device), + module.weight.to(dtype).to(self.device), # **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: @@ -1223,7 +1227,7 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str) -> None: + def _quantize_layer_via_rtn(self, name: str, dtype:torch.dtype=None) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. This function attempts to quantize a layer by switching its data type to a @@ -1240,13 +1244,14 @@ def _quantize_layer_via_rtn(self, name: str) -> None: RuntimeError: If quantization fails for reasons unrelated to memory. 
""" m = get_module(self.model, name) + if dtype is not None: + m = m.to(dtype) if is_fp8_linear(m): m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) set_module(self.model, name, m) # Step 1: Try quantization on GPU first, fall back to CPU if OOM - # if only export gguf, using gguf-packing instead of rtn if self.immediate_packing and self.iters == 0 and "gguf" in self.formats[0] and not self.disable_opt_rtn: m.scale = None m.zp = None @@ -1355,8 +1360,6 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: has_gguf_k = ( any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - if has_gguf_k: - self.model.to(torch.float32) self._quantize_embedding_layer() @@ -1471,6 +1474,12 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) input_others[key] = val.to(tmp_dtype) elif isinstance(val, list): input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + # for name in ["lm_head"]: + # dtype = None + # if self.super_group_size is not None: + # dtype = torch.float32 + # self._quantize_layer_via_rtn(name, dtype=dtype) + # clear_memory(device_list=self.device_list) for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") @@ -1501,6 +1510,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) self.device, self.cache_device, ) + if len(self.device_list) > 1: accelerate.hooks.remove_hook_from_submodules(block) @@ -1508,6 +1518,8 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) # enable moe experts act_max automatic generation for Linear set_amax_for_all_moe_layers(block, attr_name="act_max") # Normalize imatrix and quantize layers + if self.low_gpu_mem_usage: + clear_memory(device_list=self.device_list) for _, m in block.named_modules(): # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 @@ -1521,18 +1533,13 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) pbar.update(1) pbar.close() - cnt = 1 - block_names_cnt = len(flatten_list(get_block_names(self.model, True))) - clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt - if clear_mem_freq == 0: - clear_mem_freq = 1 # Process remaining layers not in blocks for name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(name) - if cnt % clear_mem_freq == 0: - clear_memory(device_list=self.device_list) - cnt = 1 - cnt += 1 + dtype=None + if self.super_group_size is not None: + dtype=torch.float32 + self._quantize_layer_via_rtn(name, dtype=dtype) + clear_memory(device_list=self.device_list) def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: keys = inputs.keys() diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 21c5a1e0d..056f74937 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -18,7 +18,7 @@ from auto_round.data_type.register import register_dtype from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES -from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants +from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants, make_qx_quants_chunk from auto_round.logger import logger from auto_round.utils import get_reciprocal from 
auto_round.utils.device import clear_memory @@ -165,6 +165,37 @@ def double_quant_tensor_sym(tensor, bits): return qdq_tensor, scale +def double_quant_tensor_sym_rtn(tensor, bits): + """ + Inplace-optimized symmetric double quantization. + - Uses float32 inplace where possible + - Minimizes temporary tensor allocations + """ + # Ensure tensor is float32 inplace (if tensor already float32, no copy) + if tensor.dtype != torch.float32: + tensor = tensor.float() # .float() creates a copy if needed + + maxq = 2 ** (bits - 1) + + # Compute absolute max along last dim + # abs_() is inplace + tensor_abs = tensor.abs() # cannot inplace abs on original if we need original sign + imax = tensor_abs.argmax(dim=-1, keepdim=True) + wmax = torch.take_along_dim(tensor, imax, dim=-1) + + # Compute scale inplace + scale = wmax / -maxq + inverse_scale = get_reciprocal(scale) + + # Inplace quantization + qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace + qdq_tensor = torch.round(qdq_tensor) # round inplace + qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace + qdq_tensor.mul_(scale) # multiply scale inplace + + return qdq_tensor, scale + + def make_qp_quants(nmax, data, quant_weights): data = data.to(torch.float32) quant_weights = quant_weights.to(torch.float32) @@ -324,7 +355,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None, split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 - group_size = 16 if bits == 2 else 32 + if bits not in [2, 4, 5]: raise ValueError(f"bits={bits} not supported by rtn_int_asym_dq") quant_weights = None @@ -409,7 +440,7 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri scale = (d_scale * q_scale).view(-1, 1) wmin = (d_wmin * q_wmin).view(-1, 1) if split_num > 1: - clear_memory([tensor.device]) + clear_memory(device_list=[tensor.device]) return scale, wmin, d_scale, d_wmin @@ -458,9 +489,9 @@ def quant_tensor_gguf_asym_dq( inverse_scale = get_reciprocal(scale) tensor = tensor.add_(wmin) - tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0, maxq) + tensor = (tensor.mul_(inverse_scale)).round_().clamp_(0, maxq) tensor = tensor.mul_(scale) - tensor = tensor.subtract_(wmin).to(orig_dtype) + tensor = tensor.sub_(wmin).to(orig_dtype) tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} @@ -640,17 +671,13 @@ def iterative_wls_quant_search( @torch.no_grad() -def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): - from auto_round.export.export_to_gguf.config import K_SCALE_SIZE, QK_K - - group_size = 16 - +def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: - scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None,split_num=split_num) else: imatrix = imatrix.to(tensor.device) weights = imatrix.reshape(1, -1) @@ -659,7 +686,7 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) - scale, int_w = make_qx_quants(tensor, 
bits=bits, rmse_type=1, qw=quant_weights) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights,split_num=split_num) return scale @@ -697,6 +724,12 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 + split_num=1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break + tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) orig_dtype = tensor.dtype super_bits = 6 if bits == 3 else 8 @@ -708,18 +741,20 @@ def quant_tensor_gguf_sym_dq( # (nb, 16, 16) tensor = tensor.reshape(n_blocks, super_group_size, QK_K // super_group_size) if scale is None and d_scale is None: - scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype) + scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num=split_num) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) # conduct double quant - scale, d_scale = double_quant_tensor_sym(scale, super_bits) + scale, d_scale = double_quant_tensor_sym_rtn(scale, super_bits) scale = scale.unsqueeze(-1) - zp = torch.full_like(scale, maxq) # pylint: disable=E1130 + # zp = torch.full_like(scale, maxq) # pylint: disable=E1130 inverse_scale = get_reciprocal(scale) - int_w = round_ste(tensor * inverse_scale).clip(-maxq, maxq - 1) + maxq - qdq_result = (scale * (int_w - zp)).to(orig_dtype) - qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) + # int_w = round_ste(tensor * inverse_scale).clip(-maxq, maxq - 1) + maxq + # qdq_result = (scale * (int_w - zp)).to(orig_dtype) + tensor = tensor.mul_(inverse_scale).round_().clamp_(-maxq, maxq - 1) + tensor = tensor.mul_(scale).to(orig_dtype) + tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) - return qdq_result, {"scale": scale, "d_scale": d_scale}, zp + return tensor, {"scale": scale, "d_scale": d_scale}, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index bc9189b7b..e5e03b7f5 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -85,6 +85,99 @@ def torch_roundf(n): return torch.sign(n) * b +def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): + """ + Extreme VRAM-optimized version of quantization. + + - Processes data in chunks along the batch dimension (dim=0) to reduce peak memory usage. + - Uses inplace operations to avoid unnecessary tensor copies. + - Reuses buffers for temporary calculations wherever possible. 
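+    For example, with split_num=4 and data of shape (4096, 16, 16), each pass quantizes a (1024, 16, 16) slice, so temporary buffers scale with the chunk rather than with the full tensor.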
+ """ + nmax = 2 ** (bits - 1) + scales_list = [] + L_list = [] + chunk_size = (data.shape[0]+split_num-1)//split_num + for start in range(0, data.shape[0], chunk_size): + end = min(start + chunk_size, data.shape[0]) + chunk = data[start:end] # Slice a batch chunk to reduce memory footprint + + # Compute absolute values inplace to avoid extra tensor allocation + chunk_abs = chunk.abs() + imax = chunk_abs.argmax(dim=-1, keepdim=True) + group_max = torch.take_along_dim(chunk, imax, dim=-1) + + # Compute scale factors (inverse max) without extra tensor + + iscales = -nmax *get_reciprocal(group_max) + + # L buffer stores quantized values, modified inplace to save memory + L = (chunk * iscales).round_().clamp_(-nmax, nmax - 1) + + # Simple case: rmse_type == 0 + if rmse_type == 0: + L.add_(nmax) # Shift to unsigned representation inplace + scales = (1 / iscales).reshape(iscales.shape[:2]) + scales_list.append(scales) + L_list.append(L.to(torch.uint8)) + continue + + return_early = False + if rmse_type < 0: + rmse_type = -rmse_type + return_early = True + + # Compute weighting tensor w based on rmse_type + if qw is not None: + w = qw + elif rmse_type == 1: + w = chunk * chunk + elif rmse_type == 2: + w = torch.ones_like(chunk) + elif rmse_type == 3: + w = chunk.abs() + else: + w = chunk.abs().sqrt() + + # Compute sumlx and suml2 using the pre-allocated L buffer + sumlx = (w * chunk * L).sum(dim=-1) + suml2 = (w * L * L).sum(dim=-1) + scales = sumlx / suml2 + + if return_early: + iscales_inv = (1 / iscales).reshape(iscales.shape[:2]) + # Mix the current scale with inverse scale if suml2 > 0 + scales = torch.where(suml2 > 0, 0.5 * (scales + iscales_inv), iscales_inv) + L.add_(nmax) + scales_list.append(scales) + L_list.append(L.to(torch.uint8)) + continue + + # Iteratively refine scales and quantized values + best = scales * sumlx + for _is in range(-9, 10): + if _is == 0: + continue + iscales_tmp = -(nmax + -0.1 * _is) / group_max + # Use a temporary L buffer to avoid creating new large tensor + L_tmp = (chunk * iscales_tmp).round_().clamp_(-nmax, nmax - 1) + sumlx_tmp = (w * chunk * L_tmp).sum(dim=-1) + suml2_tmp = (w * L_tmp * L_tmp).sum(dim=-1) + # Determine which elements should be replaced + replace_id = (suml2_tmp > 0) & (sumlx_tmp * sumlx_tmp > best * suml2_tmp) + # Inplace update of L and scales + L[replace_id] = L_tmp[replace_id] + scales[replace_id] = sumlx_tmp[replace_id] / suml2_tmp[replace_id] + best[replace_id] = scales[replace_id] * sumlx_tmp[replace_id] + + L.add_(nmax) # Final shift to unsigned + scales_list.append(scales) + L_list.append(L.to(torch.uint8)) + + # Concatenate all chunks along batch dimension + scales = torch.cat(scales_list, dim=0) + L = torch.cat(L_list, dim=0) + return scales, L + def make_qx_quants(data, bits, rmse_type=0, qw=None): """ adapted from llmacpp @@ -248,10 +341,6 @@ def make_qkx2_quants(data, bits, weights=None, rmin=-1.0, rdelta=0.1, nstep=20, return scale.reshape(scale.shape[:2]), L, the_mins.reshape(the_mins.shape[:2]) -def make_qkx3_quants(data, bits, weights, rmin=-1.0, rdelta=0.1, nstep=20, use_mad=False): - return make_qkx2_quants(data, bits, weights, rmin=rmin, rdelta=rdelta, nstep=nstep, use_mad=use_mad) - - def make_qp_quants(nmax, data, quant_weights): group_max = torch.max(data, dim=-1, keepdim=True)[0] scale = group_max / nmax diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index 6b1717e7b..f16bda005 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -307,12 +307,31 @@ def 
json_serialize(obj: Any): raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + + + def get_reciprocal(tensor): - if torch.dtype is torch.float16: - tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5) - else: - tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor) - return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor)) + """ + Memory-frugal reciprocal: + - Inplace operations on original tensor + - Only allocates small boolean mask + """ + eps = 1e-5 if tensor.dtype == torch.float16 else 1e-30 + + # Create mask for very small elements (small overhead) + mask = tensor.abs() < eps + + # Prepare output in place: reuse tensor if allowed, otherwise create once + recip = torch.empty_like(tensor) + + # Safe reciprocal: for nonzero elements + nonzero_mask = ~mask + recip[nonzero_mask] = 1.0 / tensor[nonzero_mask] + + # Zero out elements below threshold + recip[mask] = 0.0 + + return recip def normalize_input(decoding_layer_inputs: list[tuple[Any]]) -> Tuple[List[torch.Tensor], Dict[str, Any]]: diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index 4d2c7d5fd..4b599d3a5 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -319,7 +319,7 @@ def unwrapper(self, best_params): if self.orig_layer.weight.device.type == "meta": self.orig_layer.to(self.device) - ##unwrapper weight + # Unwrapper weight qdq_weight, scale, zp = self._qdq_weight(v, min_scale, max_scale) # if hasattr(self.orig_layer, "imatrix"): # self.orig_layer.imatrix = None @@ -380,7 +380,7 @@ def _set_dict_attr(attr_dict, attr_name): self.orig_layer.update() self.orig_layer.to("meta") - ##unwrapper act + # Unwrapper act if self.enable_act_quant: if not self.orig_layer.act_dynamic: act_max_scale = best_params.get("act_max_scale", torch.tensor(1.0)).to(self.device) From 0ea4fa230bf72e79805bc9636b6b0727f899cc18 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 06:47:43 +0000 Subject: [PATCH 04/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 8 ++++---- auto_round/data_type/gguf.py | 18 +++++++++--------- auto_round/export/export_to_gguf/packing.py | 5 +++-- auto_round/utils/common.py | 3 --- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 3cd49771b..907da4525 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1093,7 +1093,7 @@ def _quantize_embedding_layer(self): # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(dtype).to(self.device), # + module.weight.to(dtype).to(self.device), # **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: @@ -1227,7 +1227,7 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str, dtype:torch.dtype=None) -> None: + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. 
This function attempts to quantize a layer by switching its data type to a @@ -1535,9 +1535,9 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) pbar.close() # Process remaining layers not in blocks for name in all_to_quantized_module_names: - dtype=None + dtype = None if self.super_group_size is not None: - dtype=torch.float32 + dtype = torch.float32 self._quantize_layer_via_rtn(name, dtype=dtype) clear_memory(device_list=self.device_list) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 056f74937..8ffe4ee58 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -188,10 +188,10 @@ def double_quant_tensor_sym_rtn(tensor, bits): inverse_scale = get_reciprocal(scale) # Inplace quantization - qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace - qdq_tensor = torch.round(qdq_tensor) # round inplace - qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace - qdq_tensor.mul_(scale) # multiply scale inplace + qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace + qdq_tensor = torch.round(qdq_tensor) # round inplace + qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace + qdq_tensor.mul_(scale) # multiply scale inplace return qdq_tensor, scale @@ -671,13 +671,13 @@ def iterative_wls_quant_search( @torch.no_grad() -def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num): +def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: - scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None,split_num=split_num) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) else: imatrix = imatrix.to(tensor.device) weights = imatrix.reshape(1, -1) @@ -686,7 +686,7 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) - scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights,split_num=split_num) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights, split_num=split_num) return scale @@ -724,7 +724,7 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 - split_num=1 + split_num = 1 for dim in tensor.shape: if dim > 100_000: split_num = 16 @@ -741,7 +741,7 @@ def quant_tensor_gguf_sym_dq( # (nb, 16, 16) tensor = tensor.reshape(n_blocks, super_group_size, QK_K // super_group_size) if scale is None and d_scale is None: - scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num=split_num) + scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num=split_num) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index e5e03b7f5..81b549a4c 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -96,7 +96,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): nmax = 2 ** (bits - 1) scales_list = [] L_list = [] - chunk_size = (data.shape[0]+split_num-1)//split_num + chunk_size = (data.shape[0] + split_num - 1) // split_num 
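    # Ceil division: at most split_num chunks of up to chunk_size rows each; the last chunk may be smaller.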
for start in range(0, data.shape[0], chunk_size): end = min(start + chunk_size, data.shape[0]) chunk = data[start:end] # Slice a batch chunk to reduce memory footprint @@ -108,7 +108,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): # Compute scale factors (inverse max) without extra tensor - iscales = -nmax *get_reciprocal(group_max) + iscales = -nmax * get_reciprocal(group_max) # L buffer stores quantized values, modified inplace to save memory L = (chunk * iscales).round_().clamp_(-nmax, nmax - 1) @@ -178,6 +178,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): L = torch.cat(L_list, dim=0) return scales, L + def make_qx_quants(data, bits, rmse_type=0, qw=None): """ adapted from llmacpp diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index f16bda005..3241f0cb1 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -307,9 +307,6 @@ def json_serialize(obj: Any): raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") - - - def get_reciprocal(tensor): """ Memory-frugal reciprocal: From d40de66e76a4293bc18e7d310f56301498502dac Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 16:52:47 +0800 Subject: [PATCH 05/57] update --- README.md | 7 ++-- auto_round/compressors/base.py | 3 +- auto_round/export/export_to_gguf/convert.py | 4 ++- auto_round/export/export_to_gguf/export.py | 3 +- auto_round/export/export_to_gguf/packing.py | 37 +++++++++++++-------- 5 files changed, 33 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index d2ec512a2..367a08c7c 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") - **`layer_config` (dict)**: Configuration for weight quantization (default is `None`), mainly for mixed schemes. ##### Algorithm Settings -- **`enable_alg_ext` (bool)**: [Experimental Feature] Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. +- **`enable_alg_ext` (bool)**: [Experimental Feature] Only effective when `iters>0`. Enables algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. - **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled). ##### Tuning Process Parameters @@ -212,7 +212,8 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") -### AutoScheme Usage +### Adaptive Bits/Dtype Usage +AutoScheme provides an algorithm that automatically generates mixed bits/data_type quantization recipes. For accuracy results, please refer to this [doc](https://github.com/intel/auto-round/blob/main/docs/auto_scheme_acc.md). See the [user guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#autoscheme) for more details on AutoScheme. ~~~python from auto_round import AutoRound, AutoScheme @@ -294,7 +295,7 @@ for output in outputs: ### SGLang (Intel GPU/CUDA) -Please note that support for the MoE models and visual language models is currently limited.
+**Please note that support for the MoE models and visual language models is currently limited.** ```python import sglang as sgl diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 907da4525..fc06158a4 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1530,6 +1530,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) + clear_memory(device_list=self.device_list) pbar.update(1) pbar.close() @@ -1539,7 +1540,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) if self.super_group_size is not None: dtype = torch.float32 self._quantize_layer_via_rtn(name, dtype=dtype) - clear_memory(device_list=self.device_list) + # clear_memory(device_list=self.device_list) def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: keys = inputs.keys() diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 3ac31932d..64312d332 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, get_module, get_packing_device, is_fp8_model, logger +from auto_round.utils import LazyImport, get_module, get_packing_device, is_fp8_model, logger, clear_memory gguf = LazyImport("gguf") @@ -598,6 +598,8 @@ def prepare_tensors(cls): logger.info( f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype}" f" --> {data_qtype.name}, shape = {shape_str}" ) + if not (hasattr(cls, "current_packing_block") and cls.current_packing_block is not None): + clear_memory(device_list=[orig_device]) cls.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 890a93880..140776087 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -178,7 +178,7 @@ def pack_gguf_layer( last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} model.last_layer_name_to_block_name = last_layer_name_to_block_name if name in model.last_layer_name_to_block_name: - ##packing block + # Packing block for gguf_model in gguf_model_instance_global: gguf_model.current_packing_block = model.last_layer_name_to_block_name[name] gguf_model.prepare_tensors() @@ -189,7 +189,6 @@ def pack_gguf_layer( m.weight = None if hasattr(m, "bias"): m.bias = None - clear_memory() model.last_layer_name_to_block_name.pop(name) if len(model.last_layer_name_to_block_name) == 0: for gguf_model in gguf_model_instance_global: diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 81b549a4c..0bd76db04 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -528,9 +528,9 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i mins = wmin.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(0, 15).to(torch.uint8) - output_scale |= torch.round(mins * get_reciprocal(output_dmin)).clip(0, 
15).to(torch.uint8) << 4 - all_L = torch.round((blocks + mins.unsqueeze(-1)) / scales.unsqueeze(-1)).clip(0, 3).to(torch.uint8) + output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) + output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0,3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -556,10 +556,16 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = d_tmp != 0 all_L[replace_ids] = ( - torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - .clip(0, 3) + blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)).round_().div_(d_tmp[replace_ids].unsqueeze(-1)) + .clamp_(0, 3) .to(torch.uint8) ) + + # all_L[replace_ids] = ( + # torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) + # .clip(0, 3) + # .to(torch.uint8) + # ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -573,9 +579,9 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i mins = mins.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(0, 15).to(torch.uint8) - output_scale |= torch.round(mins * get_reciprocal(output_dmin)).clip(0, 15).to(torch.uint8) << 4 - all_L = torch.round((blocks + mins.unsqueeze(-1)) / scales.unsqueeze(-1)).clip(0, 3).to(torch.uint8) + output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) + output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 + all_L = blocks.add_(mins.unsqueeze(-1)).round_().div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) @@ -815,19 +821,22 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, if scale is not None: scales = scale.reshape(-1, QK_K // 16) output_d = d_scale.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(max=127).to(torch.int8) - all_L = torch.round(blocks * get_reciprocal(scales.unsqueeze(-1)) + 32).clip(0, 63).to(torch.uint8) + rd = get_reciprocal(output_d) + output_scale = scales.mul(rd).round_().clamp_(max=127).to(torch.int8) + rs = get_reciprocal(scales).unsqueeze_(-1) # inplace unsqueeze + all_L = blocks.mul(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) elif original: scales, all_L = make_qx_quants(blocks, bits=6, rmse_type=1, qw=None) imax = abs(scales).argmax(dim=-1, keepdim=True) max_scales = torch.take_along_dim(scales, imax, dim=-1) + iscales = -128 * get_reciprocal(max_scales) output_d = get_reciprocal(iscales) - output_scale = torch.round(iscales * scales).clip(max=127).to(torch.int8) + output_scale = (iscales * scales).round_().clamp_(max=127).to(torch.int8) d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 all_L[replace_ids] = ( - torch.round(blocks[replace_ids] / d_tmp[replace_ids].reshape(-1, 1) + 32).clip(0, 63).to(torch.uint8) + (blocks[replace_ids] / d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 
63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -838,8 +847,8 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(max=127).to(torch.int8) - all_L = torch.round(blocks * get_reciprocal(scales.unsqueeze(-1)) + 32).clip(0, 63).to(torch.uint8) + output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(max=127).to(torch.int8) + all_L = blocks.mul_(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) tmp_L = all_L.reshape(nb, 4, 64) & 0xF output_ql = (tmp_L[:, ::2] | (tmp_L[:, 1::2] << 4)).reshape(nb, QK_K // 2).cpu().numpy().astype(np.uint8) From 67a1b3433f4a0481de08f4d93e01adfce4f03dec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 09:00:36 +0000 Subject: [PATCH 06/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/convert.py | 2 +- auto_round/export/export_to_gguf/packing.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 64312d332..c075bbe7c 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, get_module, get_packing_device, is_fp8_model, logger, clear_memory +from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger gguf = LazyImport("gguf") diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 0bd76db04..fd291e815 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -530,7 +530,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0,3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -556,7 +556,10 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)).round_().div_(d_tmp[replace_ids].unsqueeze(-1)) + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .round_() + .div_(d_tmp[replace_ids].unsqueeze(-1)) .clamp_(0, 3) .to(torch.uint8) ) From 2468f0a42ea504792aec44f1dfa0d71666330248 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 18:23:33 +0800 Subject: [PATCH 07/57] fix bug --- auto_round/utils/device.py | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 2f63a3a2d..cb27a9be8 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -416,14 +416,15 @@ def _clear_memory_for_cpu_and_cuda( del tensor gc.collect() if torch.cuda.is_available(): - if device_list is None: + if not device_list: torch.cuda.synchronize() # Fix https://github.com/intel/auto-round/issues/1004 torch.cuda.empty_cache() - elif len(device_list) > 1: + elif len(device_list) >= 1: devices = [] for device in device_list: + device = str(device) if not device.startswith("cuda"): continue if ":" in device: @@ -440,6 +441,7 @@ def _clear_memory_for_cpu_and_cuda( @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): + logger.info("call") from auto_round.utils.device import is_hpex_available if is_hpex_available(): From 0bd2cf92a2937233c39f57402dcfa8af5440fe16 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:24:27 +0000 Subject: [PATCH 08/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index cb27a9be8..999b98329 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -416,7 +416,7 @@ def _clear_memory_for_cpu_and_cuda( del tensor gc.collect() if torch.cuda.is_available(): - if not device_list: + if not device_list: torch.cuda.synchronize() # Fix https://github.com/intel/auto-round/issues/1004 torch.cuda.empty_cache() From 77014877a9369f213db3a3ad938c1efee9a745ea Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 19:42:18 +0800 Subject: [PATCH 09/57] update --- auto_round/compressors/base.py | 11 +++-- auto_round/data_type/gguf.py | 39 +++++++++-------- auto_round/data_type/int.py | 7 +-- auto_round/export/export_to_gguf/packing.py | 47 +++++++++++++++------ auto_round/utils/device.py | 2 +- 5 files changed, 64 insertions(+), 42 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index fc06158a4..7d39f2696 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1129,7 +1129,7 @@ def _quantize_embedding_layer(self): del weight del scale del zp - clear_memory(self.device_list) + clear_memory(device_list = self.device_list) return is_quantized @@ -1530,7 +1530,10 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) - clear_memory(device_list=self.device_list) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.device_list) + else: + clear_memory(device_list=self.device_list) pbar.update(1) pbar.close() @@ -2846,7 +2849,7 @@ def _quantize_block( if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids) + clear_memory(input_ids,device_list=self.device_list) return q_outputs, output else: @@ -2854,7 +2857,7 @@ def _quantize_block( accelerate.hooks.remove_hook_from_submodules(block) if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids) + clear_memory(input_ids,device_list=self.device_list) return None, output diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 8ffe4ee58..bc6669837 
100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -188,12 +188,12 @@ def double_quant_tensor_sym_rtn(tensor, bits): inverse_scale = get_reciprocal(scale) # Inplace quantization - qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace - qdq_tensor = torch.round(qdq_tensor) # round inplace - qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace - qdq_tensor.mul_(scale) # multiply scale inplace + tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace + tensor = tensor.round_() # round inplace + tensor.clamp_(-maxq, maxq - 1) # clamp inplace + tensor.mul_(scale) # multiply scale inplace - return qdq_tensor, scale + return tensor, scale def make_qp_quants(nmax, data, quant_weights): @@ -448,13 +448,13 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri def quant_tensor_gguf_asym_dq( tensor: torch.Tensor, bits: int = 4, - v=0, scale_dtype=torch.float16, imatrix=None, scale=None, wmin=None, d_scale=None, d_wmin=None, + split_num=None, **kwargs, ): """Quantizes and dequantizes a tensor using asymmetric integer quantization for formats like Q2_K, Q4_K, and Q5_K. @@ -473,11 +473,12 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break + if split_num is None: + split_num = 1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) @@ -674,7 +675,7 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) + scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) #TODO split num ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) @@ -687,6 +688,8 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights, split_num=split_num) + if split_num>1: + clear_memory(device_list=[tensor.device]) return scale @@ -698,6 +701,7 @@ def quant_tensor_gguf_sym_dq( scale=None, d_scale=None, scale_dtype=torch.float16, + split_num=None, **kwargs, ): """Quantize and de-quantize tensor asymmetrically. For Q3_K, Q6_K. 
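The gguf.py hunks in this patch replace out-of-place round/clip pipelines with in-place tensor ops. A minimal sketch of that pattern, using plain per-row symmetric quantization rather than the repository's grouped GGUF variants (the helper name `qdq_sym_inplace` is illustrative, not part of auto_round):
~~~python
import torch


def qdq_sym_inplace(tensor: torch.Tensor, bits: int = 6):
    """Quantize-dequantize `tensor` in place with one scale per row (illustrative helper).

    Chaining mul_/round_/clamp_/mul_ reuses the weight buffer instead of
    allocating separate int_w and qdq_result tensors, which is where the
    VRAM saving in this patch series comes from.
    """
    maxq = 2 ** (bits - 1)
    wmax = tensor.abs().amax(dim=-1, keepdim=True)  # row-wise max magnitude
    scale = wmax / maxq
    inv_scale = torch.where(scale == 0, torch.zeros_like(scale), 1.0 / scale)
    tensor.mul_(inv_scale).round_().clamp_(-maxq, maxq - 1).mul_(scale)
    return tensor, scale


w = torch.randn(8, 256)        # toy weight block
w_qdq, s = qdq_sym_inplace(w)  # w itself now holds the dequantized values
~~~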
@@ -724,11 +728,12 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break + if split_num is None: + split_num = 1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) orig_dtype = tensor.dtype diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 8fc6f79a0..c32646da7 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -53,11 +53,6 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 bits: Number of bits for quantization (e.g., 2, 3, 4, 8) group_size: Number of elements to share scale for quantization v: Rounding value perturbation - min_scale: Minimum scale coefficient for tensor - max_scale: Maximum scale coefficient for tensor - tensor_min (Tensor, optional): Minimum tensor value for quantization. Defaults to None. - tensor_max (Tensor, optional): Maximum tensor value for quantization. Defaults to None. - scale_dtype: dtype of the quantized scale,as most kernels only support FP16 or FP32, while this value is import q_scale_thresh: clip the quantized scale's magnitude to this value to improve the numerical stability Returns: @@ -79,7 +74,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w = round_ste(tensor / scale + v) + int_w = torch.round(tensor / scale) q = torch.clamp(int_w, -maxq, maxq - 1) qdq_result = (scale * q).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index fd291e815..b83039dd3 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -52,11 +52,17 @@ def ggml_quant( shape = data.shape n_blocks = data.nelement() // block_size + split_num = 1 + for dim in data.shape: + if dim > 100_000: + split_num = 16 + break + blocks = data.reshape((n_blocks, block_size)) quant_func = GGML_QUANT_TYPE[ggml_type] try: new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num ) except Exception: device = "cpu" @@ -69,7 +75,7 @@ def ggml_quant( imatrix = imatrix.to(device) if imatrix is not None else imatrix clear_memory() new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num ) assert new_data.shape[-1] == type_size @@ -518,9 +524,9 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") -def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs): +def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): nb = blocks.shape[0] - + device=blocks.device blocks = blocks.reshape((nb, QK_K // 16, 16)) # (nb, 16, 16) if scale is not None: @@ -573,9 +579,16 
@@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] + if split_num is not None and split_num>1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + mins = mins.to("cpu") + d_wmin = d_wmin.to("cpu") + clear_memory(device_list=[device]) blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) @@ -600,7 +613,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q3_k") -def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): +def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -626,7 +639,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) @@ -653,7 +666,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") -def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs): +def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -694,7 +707,7 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -733,7 +746,7 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q5_k") def q5_k_quant_block( - blocks, scale=None, zp=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs + blocks, scale=None, zp=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -775,7 +788,7 @@ def q5_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) + 
blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -817,10 +830,10 @@ def q5_k_quant_block( @register_qtype("q6_k") -def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): +def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None,split_num=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) - + device = blocks.device if scale is not None: scales = scale.reshape(-1, QK_K // 16) output_d = d_scale.reshape(-1, 1).to(torch.float32) @@ -845,8 +858,14 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] + if split_num is not None and split_num>1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + clear_memory(device_list=[device]) + blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index cb27a9be8..f0f007064 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -441,7 +441,7 @@ def _clear_memory_for_cpu_and_cuda( @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): - logger.info("call") + # logger.info("call") from auto_round.utils.device import is_hpex_available if is_hpex_available(): From 42e5cc0dd86362b82a968a988f5af271cdcca5a4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 11:45:13 +0000 Subject: [PATCH 10/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 6 +- auto_round/data_type/gguf.py | 4 +- auto_round/export/export_to_gguf/packing.py | 73 ++++++++++++++++----- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 7d39f2696..36aff9e40 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1129,7 +1129,7 @@ def _quantize_embedding_layer(self): del weight del scale del zp - clear_memory(device_list = self.device_list) + clear_memory(device_list=self.device_list) return is_quantized @@ -2849,7 +2849,7 @@ def _quantize_block( if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids,device_list=self.device_list) + clear_memory(input_ids, device_list=self.device_list) return q_outputs, output else: @@ -2857,7 +2857,7 @@ def _quantize_block( accelerate.hooks.remove_hook_from_submodules(block) if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids,device_list=self.device_list) + clear_memory(input_ids, device_list=self.device_list) return None, output diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index bc6669837..822c75f9b 100644 --- 
a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -675,7 +675,7 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) #TODO split num + scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) # TODO split num ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) @@ -688,7 +688,7 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights, split_num=split_num) - if split_num>1: + if split_num > 1: clear_memory(device_list=[tensor.device]) return scale diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index b83039dd3..be62a1340 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -62,7 +62,15 @@ def ggml_quant( quant_func = GGML_QUANT_TYPE[ggml_type] try: new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + split_num=split_num, ) except Exception: device = "cpu" @@ -75,7 +83,15 @@ def ggml_quant( imatrix = imatrix.to(device) if imatrix is not None else imatrix clear_memory() new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + split_num=split_num, ) assert new_data.shape[-1] == type_size @@ -524,9 +540,11 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") -def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): +def q2_k_quant_block( + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs +): nb = blocks.shape[0] - device=blocks.device + device = blocks.device blocks = blocks.reshape((nb, QK_K // 16, 16)) # (nb, 16, 16) if scale is not None: @@ -579,10 +597,12 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, mins = quant_tensor_gguf_asym_dq( + blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num>1: + if split_num is not None and split_num > 1: blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") @@ -613,7 +633,9 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q3_k") -def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, 
original=False, imatrix=None, split_num=None, **kwargs): +def q3_k_quant_block( + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs +): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -639,7 +661,9 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, _ = quant_tensor_gguf_sym_dq( + blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) @@ -666,7 +690,9 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") -def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): +def q4_k_quant_block( + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs +): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -707,7 +733,9 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, mins = quant_tensor_gguf_asym_dq( + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -746,7 +774,16 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q5_k") def q5_k_quant_block( - blocks, scale=None, zp=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs + blocks, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + original=False, + split_num=None, + **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -788,7 +825,9 @@ def q5_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, mins = quant_tensor_gguf_asym_dq( + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -830,7 +869,9 @@ def q5_k_quant_block( @register_qtype("q6_k") -def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None,split_num=None, **kwargs): +def q6_k_quant_block( + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs +): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) device = blocks.device @@ -858,9 +899,11 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = 
quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, _ = quant_tensor_gguf_sym_dq( + blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] - if split_num is not None and split_num>1: + if split_num is not None and split_num > 1: blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") From 1307dddb23e14a530764f679591adaca4328fac9 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 19:49:05 +0800 Subject: [PATCH 11/57] git push --- auto_round/utils/device.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 782c5f76d..eec60ff45 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -436,6 +436,7 @@ def _clear_memory_for_cpu_and_cuda( torch.cuda.synchronize(device) torch.cuda.empty_cache() if torch.xpu.is_available(): + torch.xpu.synchronize() torch.xpu.empty_cache() From 6d6d86ab8e8df6587772c49001ce3a2462397c4e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 21:40:32 +0800 Subject: [PATCH 12/57] fix accuracy bug --- auto_round/export/export_to_gguf/packing.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index be62a1340..d3c5a38e4 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -482,7 +482,7 @@ def q5_0_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_0"][0] # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 16.5).clip(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 16.5).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -508,7 +508,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc((blocks - min) * id + 0.5).clip(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub_(min).mul_(id).add_ (0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -528,7 +528,6 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: else: d = torch.abs(blocks).max(dim=1, keepdim=True)[0] / 127 id = get_reciprocal(d) - qs = torch.clip(torch_roundf(blocks * id), -128, 127) # (n_blocks, 2) @@ -554,7 +553,7 @@ def q2_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -582,8 +581,7 @@ def q2_k_quant_block( all_L[replace_ids] = ( 
blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) - .round_() - .div_(d_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)).round_() .clamp_(0, 3) .to(torch.uint8) ) @@ -617,7 +615,7 @@ def q2_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).round_().div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) From e2586f95c378bf65ff3189214b322ec45f7cdff8 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 21:47:55 +0800 Subject: [PATCH 13/57] trigger ut --- auto_round/export/export_to_gguf/packing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index d3c5a38e4..9e3d92845 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -535,6 +535,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: # (n_blocks, block_size) qs = qs.cpu().numpy().astype(np.int8).view(np.uint8) + return np.concatenate([d, qs], axis=1) From c7b3c241ad2f0644162dde5e7554336253ede357 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 21:53:12 +0800 Subject: [PATCH 14/57] clean code --- auto_round/compressors/base.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 36aff9e40..48f082f3a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1086,14 +1086,15 @@ def _quantize_embedding_layer(self): quant_func = QUANT_FUNC_WITH_DTYPE[dtype] dtype = module.weight.dtype - # As typically float32 are used in RTN to search scale zp, to avoid cache a bf16 copy we'd better use float32 + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 if config["super_group_size"] is not None: dtype = torch.float32 # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(dtype).to(self.device), # + module.weight.to(dtype).to(self.device), **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: @@ -1474,12 +1475,6 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) input_others[key] = val.to(tmp_dtype) elif isinstance(val, list): input_others[key] = [to_dtype(v, tmp_dtype) for v in val] - # for name in ["lm_head"]: - # dtype = None - # if self.super_group_size is not None: - # dtype = torch.float32 - # self._quantize_layer_via_rtn(name, dtype=dtype) - # clear_memory(device_list=self.device_list) for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") From 8ad2019e3019c03b316d0e0242c516bebb524d07 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 11:06:00 +0800 Subject: [PATCH 15/57] q80 q4k --- auto_round/export/export_to_gguf/packing.py | 60 ++++++++++----------- auto_round/utils/device.py | 56 +++++++++++++------ 2 files changed, 69 insertions(+), 47 deletions(-) diff --git 
a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9e3d92845..8fd78d4b2 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -72,7 +72,8 @@ def ggml_quant( original=original, split_num=split_num, ) - except Exception: + except torch.OutOfMemoryError: + orig_device = blocks.device device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -81,7 +82,7 @@ def ggml_quant( d_scale = d_scale.to(device) if d_scale is not None else d_scale d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory() + clear_memory(device_list=orig_device) new_data = quant_func( blocks, scale, @@ -103,7 +104,7 @@ def ggml_quant( def torch_roundf(n): a = torch.abs(n) floored = torch.floor(a) - b = floored + torch.floor(2 * (a - floored)) + b = floored + torch.floor((a - floored).mul_(2)) return torch.sign(n) * b @@ -528,14 +529,14 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: else: d = torch.abs(blocks).max(dim=1, keepdim=True)[0] / 127 id = get_reciprocal(d) - qs = torch.clip(torch_roundf(blocks * id), -128, 127) + blocks = blocks.mul_(id) + qs = torch_roundf(blocks).clamp_(-128, 127) # (n_blocks, 2) d = d.cpu().numpy().astype(np.float16).view(np.uint8) # (n_blocks, block_size) qs = qs.cpu().numpy().astype(np.int8).view(np.uint8) - return np.concatenate([d, qs], axis=1) @@ -587,11 +588,6 @@ def q2_k_quant_block( .to(torch.uint8) ) - # all_L[replace_ids] = ( - # torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - # .clip(0, 3) - # .to(torch.uint8) - # ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -695,20 +691,17 @@ def q4_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) - output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) if scale is not None: scales = scale.reshape(-1, QK_K // 32) mins = wmin.reshape(-1, QK_K // 32) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) - all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 15) - .to(torch.uint8) - ) + q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clam_(0, 63).to(torch.uint8) + all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). 
+ round_().clamp_(0,15).to(torch.uint8)) + elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=4, rmin=-1, rdelta=0.1, nstep=20, use_mad=False) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -717,17 +710,15 @@ def q4_k_quant_block( id_mins = (63 * get_reciprocal(max_mins)).clamp(min=0) output_d = max_scales / 63 output_dmin = max_mins / 63 - q_scales = torch.round(id_scales * scales).clip(0, 63).to(torch.uint8) - q_mins = torch.round(id_mins * mins).clip(0, 63).to(torch.uint8) + q_scales = (id_scales * scales).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (id_mins * mins).round_().clip(0, 63).to(torch.uint8) d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids] = ( - torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - .clip(0, 15) - .to(torch.uint8) - ) + all_L[replace_ids] = (blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). + div_(d_tmp[replace_ids].unsqueeze(-1)).clamp_(0,15).to(torch.uint8)) + else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -737,20 +728,28 @@ def q4_k_quant_block( ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] + if split_num > 1: + orig_device = blocks.device + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + mins = mins.to("cpu") + d_wmin = d_wmin.to("cpu") + blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) mins = mins.reshape((-1, QK_K // 32)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) + q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 15) + blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + .clamp(0, 15) .to(torch.uint8) ) - + output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) output_scale[:, :4] = q_scales[:, :4] output_scale[:, 4:8] = q_mins[:, :4] @@ -906,7 +905,6 @@ def q6_k_quant_block( blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") - clear_memory(device_list=[device]) blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index eec60ff45..fddb5c969 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -407,39 +407,63 @@ def bytes_to_gigabytes(bytes) -> int: def _clear_memory_for_cpu_and_cuda( - tensor: torch.Tensor | list[torch.Tensor] | None = None, device_list: tuple | list | None = None + tensor: torch.Tensor | list[torch.Tensor] | None = None, + device_list: tuple | list | str | torch.device | None = None ): + # ------------------------ + # Clear CPU-side references + # ------------------------ if isinstance(tensor, list): for i in range(len(tensor)): tensor[i] = None - if tensor is not None: - del tensor + tensor = None gc.collect() + + # ------------------------ + # Normalize device_list + # ------------------------ + if isinstance(device_list, (str, torch.device)): + 
device_list = [device_list] + + # ----------------------------------- + # CUDA-specific clearing + # ----------------------------------- if torch.cuda.is_available(): + # No device_list → clear all GPUs if not device_list: - torch.cuda.synchronize() # Fix https://github.com/intel/auto-round/issues/1004 + torch.cuda.synchronize() torch.cuda.empty_cache() - - elif len(device_list) >= 1: + else: + # Parse valid CUDA device IDs devices = [] - for device in device_list: - device = str(device) - if not device.startswith("cuda"): + for dev in device_list: + dev = str(dev) + if not dev.startswith("cuda"): continue - if ":" in device: - device = device.split(":")[-1] + # cuda / cuda:0 / cuda:1 + if ":" in dev: + devid = int(dev.split(":")[-1]) else: - device = 0 - devices.append(int(device)) - for device in devices: - torch.cuda.synchronize(device) + devid = 0 + devices.append(devid) + + for d in devices: + torch.cuda.synchronize(d) + torch.cuda.empty_cache() - if torch.xpu.is_available(): + + # ----------------------------------- + # XPU-specific clearing + # ----------------------------------- + if hasattr(torch, "xpu") and torch.xpu.is_available(): torch.xpu.synchronize() torch.xpu.empty_cache() + + + @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): # logger.info("call") From d3168544cb259785cb351f417e426ec88a80b7ef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 03:07:32 +0000 Subject: [PATCH 16/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py | 29 ++++++++++++++------- auto_round/utils/device.py | 5 +--- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 8fd78d4b2..d8c2578b9 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -509,7 +509,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc(blocks.sub_(min).mul_(id).add_ (0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -583,7 +583,8 @@ def q2_k_quant_block( all_L[replace_ids] = ( blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) - .div_(d_tmp[replace_ids].unsqueeze(-1)).round_() + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() .clamp_(0, 3) .to(torch.uint8) ) @@ -691,7 +692,6 @@ def q4_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) - if scale is not None: scales = scale.reshape(-1, QK_K // 32) mins = wmin.reshape(-1, QK_K // 32) @@ -699,8 +699,13 @@ def q4_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clam_(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). 
- round_().clamp_(0,15).to(torch.uint8)) + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() + .clamp_(0, 15) + .to(torch.uint8) + ) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=4, rmin=-1, rdelta=0.1, nstep=20, use_mad=False) @@ -716,8 +721,13 @@ def q4_k_quant_block( d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids] = (blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). - div_(d_tmp[replace_ids].unsqueeze(-1)).clamp_(0,15).to(torch.uint8)) + all_L[replace_ids] = ( + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .clamp_(0, 15) + .to(torch.uint8) + ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -736,7 +746,6 @@ def q4_k_quant_block( mins = mins.to("cpu") d_wmin = d_wmin.to("cpu") - blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) mins = mins.reshape((-1, QK_K // 32)) @@ -745,7 +754,9 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() .clamp(0, 15) .to(torch.uint8) ) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index fddb5c969..3ab9ae72a 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -408,7 +408,7 @@ def bytes_to_gigabytes(bytes) -> int: def _clear_memory_for_cpu_and_cuda( tensor: torch.Tensor | list[torch.Tensor] | None = None, - device_list: tuple | list | str | torch.device | None = None + device_list: tuple | list | str | torch.device | None = None, ): # ------------------------ # Clear CPU-side references @@ -461,9 +461,6 @@ def _clear_memory_for_cpu_and_cuda( torch.xpu.empty_cache() - - - @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): # logger.info("call") From 1743472bd92685c417cea544e3007bc0e3c48c3f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 12:15:40 +0800 Subject: [PATCH 17/57] q5k --- auto_round/data_type/gguf.py | 187 +++++++++----------- auto_round/export/export_to_gguf/packing.py | 56 +++--- 2 files changed, 106 insertions(+), 137 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 822c75f9b..4912e6e77 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -497,73 +497,6 @@ def quant_tensor_gguf_asym_dq( return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} -def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): - """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. - - Args: - data (torch.Tensor): Input tensor to quantize. - bits (int): Number of quantization bits. - rrmin (float): Initial range scaling factor. - rdelta (float): Step size for range scaling. - nstep (int): Number of search steps. - use_mad (bool): Whether to use mean absolute deviation instead of squared error. - weights (torch.Tensor): Weight matrix for each element. 
- - Returns: - Tuple: (Optimal scale tensor, optimal minimum value tensor) - """ - dtype = torch.float32 - data = data.to(dtype) - maxq = 2**bits - 1 - minq = 0 - weights = 1.0 if weights is None else weights.to(dtype) - - rmin = torch.min(data, dim=1, keepdim=True)[0] - rmax = torch.max(data, dim=1, keepdim=True)[0] - - sum_w = torch.sum(weights, dim=1, keepdim=True) - sum_x = torch.sum(weights * data, dim=1, keepdim=True) - - # scale = 1 / ((maxq - minq) / (rmax - rmin + 1e-8)) - scale = (rmax - rmin) / (maxq - minq) - iscale = get_reciprocal(scale) - # quant_data = torch.clamp(torch.round((maxq - minq) / (rmax - rmin + 1e-8) * (data - rmin)), minq, maxq) - quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq) - diff = scale * quant_data + rmin - data - - best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True) - - for is_ in range(nstep): - factor = rrmin + rdelta * is_ + maxq - minq - # iscale_new = factor / (rmax - rmin + 1e-8) - scale_new = (rmax - rmin) / factor - iscale_new = get_reciprocal(scale_new) - quant_data_new = torch.clamp(torch.round(iscale_new * (data - rmin)), minq, maxq) - - mul_weights_quant_data = weights * quant_data_new - sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) - sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) - sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True) - - D = sum_w * sum_l2 - torch.pow(sum_l, 2) - this_scale = (sum_w * sum_xl - sum_x * sum_l) / D - this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D - this_min[this_min > 0] = 0 - this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] - reverse_this_scale = get_reciprocal(this_scale) - - quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq) - diff = this_scale * quant_data + this_min - data - # diff = this_scale * quant_data_new + this_min - data - mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True) - - idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] - best_mad[idx_to_replace] = mad[idx_to_replace] - scale[idx_to_replace] = this_scale[idx_to_replace] - rmin[idx_to_replace] = this_min[idx_to_replace] - - return scale.to(torch.float32), -rmin.to(torch.float32) - # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( @@ -577,52 +510,99 @@ def iterative_wls_quant_search_chunk( results_scale = [] results_rmin = [] + chunk_size = (data.shape[0] + split_num - 1) // split_num + for start in range(0, data.shape[0], chunk_size): end = min(start + chunk_size, data.shape[0]) chunk = data[start:end] chunk_weights = weights if isinstance(weights, float) else weights[start:end] + # Pre-allocate reusable buffers to avoid new allocations + tmp = torch.empty_like(chunk) + quant_data = torch.empty_like(chunk) + diff = torch.empty_like(chunk) + rmin = torch.min(chunk, dim=1, keepdim=True)[0] rmax = torch.max(chunk, dim=1, keepdim=True)[0] sum_w = torch.sum(chunk_weights, dim=1, keepdim=True) sum_x = torch.sum(chunk_weights * chunk, dim=1, keepdim=True) + scale = (rmax - rmin) / (maxq - minq) iscale = get_reciprocal(scale) - quant_data = torch.clamp(torch.round(iscale * (chunk - rmin)), minq, maxq) - diff = scale * quant_data + rmin - chunk - best_mad = torch.sum( - (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True - ) + + # tmp = (chunk - rmin) * iscale + 
tmp.copy_(chunk).sub_(rmin).mul_(iscale) + + # quant_data = round(tmp).clamp_() + torch.round(tmp, out=quant_data) + quant_data.clamp_(minq, maxq) + + # diff = scale * quant_data + rmin - chunk + diff.copy_(quant_data).mul_(scale).add_(rmin).sub_(chunk) + + if use_mad: + best_mad = (chunk_weights * diff.abs_()).sum(dim=1, keepdim=True) + else: + diff.pow_(2) + best_mad = (chunk_weights * diff).sum(dim=1, keepdim=True) for is_ in range(nstep): factor = rrmin + rdelta * is_ + maxq - minq + scale_new = (rmax - rmin) / factor iscale_new = get_reciprocal(scale_new) - quant_data_new = torch.clamp(torch.round(iscale_new * (chunk - rmin)), minq, maxq) - mul_weights_quant_data = chunk_weights * quant_data_new - sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) - sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) - sum_xl = torch.sum(mul_weights_quant_data * chunk, dim=-1, keepdim=True) - D = sum_w * sum_l2 - torch.pow(sum_l, 2) + + # tmp = (chunk - rmin) * iscale_new + tmp.copy_(chunk).sub_(rmin).mul_(iscale_new) + + torch.round(tmp, out=quant_data) + quant_data.clamp_(minq, maxq) + + # tmp = chunk_weights * quant_data + tmp.copy_(quant_data).mul_(chunk_weights) + + sum_l = tmp.sum(dim=-1, keepdim=True) + sum_l2 = (tmp * quant_data).sum(dim=-1, keepdim=True) + sum_xl = (tmp * chunk).sum(dim=-1, keepdim=True) + + D = sum_w * sum_l2 - sum_l * sum_l + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D - this_min[this_min > 0] = 0 - this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] + + mask = this_min > 0 + if mask.any(): + this_min[mask] = 0 + this_scale[mask] = (sum_xl / sum_l2)[mask] + reverse_this_scale = get_reciprocal(this_scale) - quant_data = torch.clamp(torch.round(reverse_this_scale * (chunk - this_min)), minq, maxq) - diff = this_scale * quant_data + this_min - chunk - mad = torch.sum( - (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), - dim=-1, - keepdim=True, - ) + + # tmp = (chunk - this_min) * reverse_this_scale + tmp.copy_(chunk).sub_(this_min).mul_(reverse_this_scale) + + torch.round(tmp, out=quant_data) + quant_data.clamp_(minq, maxq) + + # diff = this_scale * quant_data + this_min - chunk + diff.copy_(quant_data).mul_(this_scale).add_(this_min).sub_(chunk) + + if use_mad: + mad = (chunk_weights * diff.abs_()).sum(dim=-1, keepdim=True) + else: + diff.pow_(2) + mad = (chunk_weights * diff).sum(dim=-1, keepdim=True) + idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] + best_mad[idx_to_replace] = mad[idx_to_replace] scale[idx_to_replace] = this_scale[idx_to_replace] rmin[idx_to_replace] = this_min[idx_to_replace] + results_scale.append(scale.to(torch.float32)) results_rmin.append(-rmin.to(torch.float32)) + + # YOUR ORIGINAL LOGIC — kept unchanged if split_num > 1: clear_memory(device_list=[data.device]) @@ -648,27 +628,18 @@ def iterative_wls_quant_search( """ # TODO this one should change to try catch later - if split_num > 1: - return iterative_wls_quant_search_chunk( - data=data, - bits=bits, - rrmin=rrmin, - rdelta=rdelta, - nstep=nstep, - use_mad=use_mad, - weights=weights, - split_num=split_num, - ) - else: - return iterative_wls_quant_search_non_chunk( - data=data, - bits=bits, - rrmin=rrmin, - rdelta=rdelta, - nstep=nstep, - use_mad=use_mad, - weights=weights, - ) + + return iterative_wls_quant_search_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + 
split_num=split_num, + ) + @torch.no_grad() diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index d8c2578b9..912d543cf 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -640,15 +640,15 @@ def q3_k_quant_block( if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = (torch.round(blocks * get_reciprocal(qdq_scale.unsqueeze(-1))).clip(-4, 3) + 4).to(torch.uint8) - q_scales_offset = torch.round(qdq_scale * get_reciprocal(dq_scale)).clip(-32, 31) + 32 + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) elif original: ## this is correct scales, _ = make_q3_quants(blocks, bits=3, do_rmse=True) scales_abs_max = abs(scales).argmax(dim=-1, keepdim=True) max_scales_mag = torch.take_along_dim(scales, scales_abs_max, dim=-1) inverse_dq_scale = -32 * get_reciprocal(max_scales_mag) dq_scale = get_reciprocal(inverse_dq_scale) - qscale = torch.round(inverse_dq_scale * scales).clip(-32, 31) + qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) all_L = (torch.round(blocks * reverse_qdq_scale.unsqueeze(-1)).clip(-4, 3) + 4).to(torch.uint8) @@ -687,7 +687,7 @@ def q3_k_quant_block( @register_qtype("q4_k") def q4_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -698,7 +698,7 @@ def q4_k_quant_block( output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) - q_mins = (mins * get_reciprocal(output_dmin)).round_().clam_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) @@ -738,8 +738,7 @@ def q4_k_quant_block( ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num > 1: - orig_device = blocks.device + if split_num is not None and split_num > 1: blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") @@ -791,26 +790,22 @@ def q5_k_quant_block( d_wmin=None, imatrix=None, original=False, - split_num=None, + split_num=1, **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) - output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) - if scale is not None: scales = scale.reshape(-1, QK_K // 32) mins = wmin.reshape(-1, QK_K // 32) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) - all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 31) - .to(torch.uint8) - ) + q_scales = (scales * 
get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) + all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). + round_().clamp_(0, 31).to(torch.uint8)) + elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=5, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=False) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -819,17 +814,14 @@ def q5_k_quant_block( id_mins = (63 * get_reciprocal(max_mins)).clamp(min=0) output_d = max_scales / 63 output_dmin = max_mins / 63 - q_scales = torch.round(id_scales * scales).clip(0, 63).to(torch.uint8) - q_mins = torch.round(id_mins * mins).clip(0, 63).to(torch.uint8) + q_scales = (id_scales * scales).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (id_mins * mins).round_().clamp_(0, 63).to(torch.uint8) d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids] = ( - torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - .clip(0, 31) - .to(torch.uint8) - ) + all_L[replace_ids]=(blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). + div_(d_tmp[replace_ids].unsqueeze(-1)).round_().clamp_(0,31).to(torch.int8)) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -839,19 +831,25 @@ def q5_k_quant_block( ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] + if split_num is not None and split_num > 1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + mins = mins.to("cpu") + d_wmin = d_wmin.to("cpu") blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) mins = mins.reshape((-1, QK_K // 32)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) - all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 31) + q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) + all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + .clamp_(0, 31) .to(torch.uint8) ) + output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) output_scale[:, :4] = q_scales[:, :4] output_scale[:, 4:8] = q_mins[:, :4] From 5ffa12b61bf4e9d76c3f8d8f3edc6ce3e7ed686c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 04:16:24 +0000 Subject: [PATCH 18/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 2 -- auto_round/export/export_to_gguf/packing.py | 24 ++++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 4912e6e77..84ef77f29 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -497,7 +497,6 @@ def quant_tensor_gguf_asym_dq( return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} - # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def 
iterative_wls_quant_search_chunk( data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 @@ -641,7 +640,6 @@ def iterative_wls_quant_search( ) - @torch.no_grad() def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 912d543cf..6779a81ef 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -803,8 +803,13 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). - round_().clamp_(0, 31).to(torch.uint8)) + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() + .clamp_(0, 31) + .to(torch.uint8) + ) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=5, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=False) @@ -820,8 +825,14 @@ def q5_k_quant_block( d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids]=(blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). - div_(d_tmp[replace_ids].unsqueeze(-1)).round_().clamp_(0,31).to(torch.int8)) + all_L[replace_ids] = ( + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() + .clamp_(0, 31) + .to(torch.int8) + ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -845,7 +856,10 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() .clamp_(0, 31) .to(torch.uint8) ) From db5c64237aebf106eda9fd797d3cc646bf7010eb Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:19:30 +0800 Subject: [PATCH 19/57] all ggufs use inplace ops --- auto_round/export/export_to_gguf/packing.py | 32 +++++++++++++-------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 912d543cf..a5fc92d38 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -433,10 +433,10 @@ def q4_0_quant_block(blocks, scale=None, zp=None, **kwargs): max = torch.take_along_dim(blocks, imax, dim=-1) d = max / -8 id = get_reciprocal(d) + n_blocks = blocks.shape[0] + qs = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(8.5)).clamp_(0, 15).to(torch.uint8) - qs = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 8.5).clip(0, 15).to(torch.uint8) - n_blocks = blocks.shape[0] block_size = GGML_QUANT_SIZES["q4_0"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << 4) @@ -456,10 +456,11 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): min = blocks.min(axis=-1, keepdims=True)[0] d = (max - min) / 
15 id = get_reciprocal(d) + n_blocks = blocks.shape[0] + + qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) - qs = torch.trunc((blocks - min) * id + 0.5).clip(0, 15).to(torch.uint8) - n_blocks = blocks.shape[0] block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) @@ -483,7 +484,7 @@ def q5_0_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_0"][0] # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 16.5).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(16.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -635,14 +636,14 @@ def q3_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) - output_scale = np.empty((nb, K_SCALE_SIZE), dtype=np.uint8) + if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) - elif original: ## this is correct + elif original: scales, _ = make_q3_quants(blocks, bits=3, do_rmse=True) scales_abs_max = abs(scales).argmax(dim=-1, keepdim=True) max_scales_mag = torch.take_along_dim(scales, scales_abs_max, dim=-1) @@ -651,8 +652,8 @@ def q3_k_quant_block( qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) - all_L = (torch.round(blocks * reverse_qdq_scale.unsqueeze(-1)).clip(-4, 3) + 4).to(torch.uint8) - q_scales_offset = torch.round(qdq_scale * inverse_dq_scale).clip(-32, 31) + 32 + all_L = blocks.mul_(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + q_scales_offset = (qdq_scale * inverse_dq_scale).round_().clamp_(-32, 31).add_(32) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -661,12 +662,19 @@ def q3_k_quant_block( blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num ) scales, d_scale = scales["scale"], scales["d_scale"] + if split_num is not None and split_num > 1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = (torch.round(blocks * get_reciprocal(qdq_scale.unsqueeze(-1))).clip(-4, 3) + 4).to(torch.uint8) - q_scales_offset = torch.round(qdq_scale * get_reciprocal(dq_scale)).clip(-32, 31) + 32 + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4,3).add_(4).to(torch.uint8) + + q_scales_offset =(qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) + output_scale = np.empty((nb, K_SCALE_SIZE), dtype=np.uint8) q_scales_offset = q_scales_offset.cpu().numpy().astype(np.uint8) output_scale[:, :8] = (q_scales_offset[:, :8] & 0xF) | ((q_scales_offset[:, 8:] & 0xF) << 4) hmask = q_scales_offset >> 4 @@ -756,7 +764,7 @@ def q4_k_quant_block( blocks.add_(mins.unsqueeze(-1)) 
.mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() - .clamp(0, 15) + .clamp_(0, 15) .to(torch.uint8) ) output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) From ec6cb4629ebae417e3a7ab420b1955b508cf1d01 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:47:32 +0800 Subject: [PATCH 20/57] update --- auto_round/data_type/gguf.py | 11 ++-- auto_round/data_type/int.py | 60 +++++++++++++++----- auto_round/export/export_to_gguf/packing.py | 62 ++++++++++++++++++--- auto_round/utils/device.py | 1 - 4 files changed, 105 insertions(+), 29 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 4912e6e77..e6de5ed6f 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -351,13 +351,12 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens return imatrix.reshape(weight.shape) -@torch.no_grad() +@torch.inference_mode() def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None, split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 - if bits not in [2, 4, 5]: - raise ValueError(f"bits={bits} not supported by rtn_int_asym_dq") + quant_weights = None if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): search_kwargs = { @@ -470,6 +469,8 @@ def quant_tensor_gguf_asym_dq( Returns: Tuple: (Quantized-dequantized tensor, scale dictionary, zero-point dictionary) """ + if bits not in [2, 4, 5]: + raise ValueError(f"bits={bits} not supported by rtn_int_asym_dq") orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 @@ -602,7 +603,6 @@ def iterative_wls_quant_search_chunk( results_scale.append(scale.to(torch.float32)) results_rmin.append(-rmin.to(torch.float32)) - # YOUR ORIGINAL LOGIC — kept unchanged if split_num > 1: clear_memory(device_list=[data.device]) @@ -641,8 +641,7 @@ def iterative_wls_quant_search( ) - -@torch.no_grad() +@torch.inference_mode() def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index c32646da7..325c9d40a 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -22,25 +22,60 @@ def search_scales(data: torch.Tensor, bits: int, qw: Union[None, torch.Tensor, float] = None) -> torch.Tensor: - nmax = pow(2, bits - 1) + # Maximum absolute value for symmetric quantization + nmax = 1 << (bits - 1) # equivalent to pow(2, bits-1) + + # Find per-group max along the last dimension imax = torch.abs(data).argmax(dim=-1, keepdim=True) group_max = torch.take_along_dim(data, imax, dim=-1) + + # Compute initial inverse scales iscales = -nmax * get_reciprocal(group_max) - scales = get_reciprocal(iscales) - L = torch.round(1.0 * iscales * data).clip(-nmax, nmax - 1) + scales = get_reciprocal(iscales) # scale = 1 / iscales + + # Initial quantized values (in-place round and clamp) + L = torch.empty_like(data) + torch.round(iscales * data, out=L) + L.clamp_(-nmax, nmax - 1) + + # Set default weight if None if qw is None: qw = 1.0 - best_loss = torch.sum(((scales * L - data).to(torch.float32)) ** 2 * qw, dim=-1) + + # Compute initial best loss + best_loss = ((scales * L - data).to(torch.float32)) ** 2 + if isinstance(qw, torch.Tensor): + best_loss.mul_(qw) # inplace multiply by weight + best_loss = torch.sum(best_loss, dim=-1) + + # Iterative search 
over small adjustments for _is in range(-18 * 5, 18 * 5 + 1): if _is == 0: continue - iscales = -(nmax - 0.01 * _is) * get_reciprocal(group_max) - tmp_L = torch.round(iscales * data).clip(-nmax, nmax - 1) - tmp_scales = get_reciprocal(iscales) - loss = torch.sum(((tmp_scales * tmp_L - data).to(torch.float32)) ** 2 * qw, dim=-1) + + # Update iscales in-place + iscales_tmp = -(nmax - 0.01 * _is) * get_reciprocal(group_max) + + # Compute temporary quantized values (in-place round + clamp) + tmp_L = torch.empty_like(data) + torch.round(iscales_tmp * data, out=tmp_L) + tmp_L.clamp_(-nmax, nmax - 1) + + # Compute temporary scales + tmp_scales = get_reciprocal(iscales_tmp) + + # Compute temporary loss + loss = ((tmp_scales * tmp_L - data).to(torch.float32)) ** 2 + if isinstance(qw, torch.Tensor): + loss.mul_(qw) + loss = torch.sum(loss, dim=-1) + + # Replace scales where loss improves (in-place) replace_id = loss < best_loss - scales[replace_id] = tmp_scales[replace_id] - best_loss[replace_id] = loss[replace_id] + if replace_id.any(): + scales[replace_id] = tmp_scales[replace_id] + best_loss[replace_id] = loss[replace_id] + return scales @@ -74,9 +109,8 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w = torch.round(tensor / scale) - q = torch.clamp(int_w, -maxq, maxq - 1) - qdq_result = (scale * q).to(tensor.dtype) + int_w =tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) + qdq_result = (int_w.mul_(scale)).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, scale, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index a5fc92d38..0952578e1 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -259,34 +259,78 @@ def make_qx_quants(data, bits, rmse_type=0, qw=None): def make_q3_quants(data, bits, do_rmse=False): - nmax = pow(2, bits - 1) + # Maximum absolute integer value for symmetric quantization + nmax = 1 << (bits - 1) # equivalent to pow(2, bits-1) + + # Find per-group max indices along last dim imax = abs(data).argmax(axis=-1, keepdims=True) + + # Gather group-wise maximum values group_max = torch.take_along_dim(data, imax, dim=-1) + + # Compute inverse scale in-place (multiplying by -nmax) iscale = -nmax * get_reciprocal(group_max) + if do_rmse: - L = torch.round(iscale * data).clip(-nmax, nmax - 1) - w = torch.pow(data, 2) + # Initial quantization L (in-place round and clamp) + L = torch.empty_like(data) + torch.round(iscale * data, out=L) + L.clamp_(-nmax, nmax - 1) + + # Weight for RMSE = x^2 (in-place) + w = data.clone().pow_(2) + + # Precompute sums sumlx = torch.sum(w * data * L, dim=-1) suml2 = torch.sum(w * L * L, dim=-1) - for itry in range(5): + # Iterative RMSE refinement + for _ in range(5): for i in range(sumlx.shape[-1]): - w_tmp, data_tmp, L_tmp = w[:, :, i], data[:, :, i], L[:, :, i] + # Extract current slice + w_tmp = w[:, :, i] + data_tmp = data[:, :, i] + L_tmp = L[:, :, i] + + # Exclude current slice from sums slx = sumlx - w_tmp * data_tmp * L_tmp replace_idx = slx > 0 - sl2 = suml2 - w_tmp * torch.pow(L_tmp, 2) - new_L = torch.round(data_tmp * sl2 / slx).clip(-nmax, nmax - 1) + sl2 = suml2 - w_tmp * L_tmp * L_tmp + + # Compute new L candidate (in-place round and clamp) + new_L = 
torch.empty_like(L_tmp) + torch.round(data_tmp * sl2 / slx, out=new_L) + new_L.clamp_(-nmax, nmax - 1) + + # Identify positions to update tmp_replace_idx = replace_idx & (new_L != L_tmp) + + # Update sums where L changes slx[tmp_replace_idx] += w_tmp[tmp_replace_idx] * data_tmp[tmp_replace_idx] * new_L[tmp_replace_idx] sl2[tmp_replace_idx] += w_tmp[tmp_replace_idx] * new_L[tmp_replace_idx] * new_L[tmp_replace_idx] + + # Further check condition for improvement replace_idx &= (sl2 > 0) & (slx * slx * suml2 > sumlx * sumlx * sl2) - L[:, :, i][replace_idx] = new_L[replace_idx] + + # Update L in-place + L_tmp[replace_idx] = new_L[replace_idx] + + # Update global sums sumlx = slx suml2 = sl2 + + # Compute final scale and return quantized L return sumlx * get_reciprocal(suml2), L.to(torch.uint8) - L = torch.round(iscale * data).clip(-nmax, nmax - 1) + nmax + # Fast path: quantize without RMSE (in-place round, clamp, shift) + L = torch.empty_like(data) + torch.round(iscale * data, out=L) + L.clamp_(-nmax, nmax - 1) + L.add_(nmax) + + # Compute scales (reciprocal of iscale) scales = get_reciprocal(iscale).reshape(iscale.shape[:2]) + return scales, L.to(torch.uint8) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 3ab9ae72a..0d8413561 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -463,7 +463,6 @@ def _clear_memory_for_cpu_and_cuda( @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): - # logger.info("call") from auto_round.utils.device import is_hpex_available if is_hpex_available(): From 5a503b4d9c2ad72d010ad52c0060fcb4492a4d7e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:48:54 +0800 Subject: [PATCH 21/57] update --- auto_round/export/export_to_gguf/packing.py | 24 ++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 0952578e1..2714806ed 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -855,8 +855,13 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). - round_().clamp_(0, 31).to(torch.uint8)) + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() + .clamp_(0, 31) + .to(torch.uint8) + ) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=5, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=False) @@ -872,8 +877,14 @@ def q5_k_quant_block( d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids]=(blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). 
- div_(d_tmp[replace_ids].unsqueeze(-1)).round_().clamp_(0,31).to(torch.int8)) + all_L[replace_ids] = ( + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() + .clamp_(0, 31) + .to(torch.int8) + ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -897,7 +908,10 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() .clamp_(0, 31) .to(torch.uint8) ) From 737977a992fac1b42440fcdf265121f374cfd019 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 05:50:03 +0000 Subject: [PATCH 22/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 1 - auto_round/data_type/int.py | 2 +- auto_round/export/export_to_gguf/packing.py | 16 +++++++++------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index e580ce0a6..a0eebfb1c 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -356,7 +356,6 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 - quant_weights = None if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): search_kwargs = { diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 325c9d40a..8c7b1f261 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -109,7 +109,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w =tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) + int_w = tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) qdq_result = (int_w.mul_(scale)).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, scale, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 2714806ed..9a885212a 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -480,7 +480,6 @@ def q4_0_quant_block(blocks, scale=None, zp=None, **kwargs): n_blocks = blocks.shape[0] qs = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(8.5)).clamp_(0, 15).to(torch.uint8) - block_size = GGML_QUANT_SIZES["q4_0"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << 4) @@ -504,7 +503,6 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) - block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) @@ -528,7 +526,13 @@ def q5_0_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): 
block_size = GGML_QUANT_SIZES["q5_0"][0] # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(16.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = ( + torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(16.5)) + .clamp_(0, 31) + .to(torch.uint8) + .cpu() + .numpy() + ) qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -680,8 +684,6 @@ def q3_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) - - if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) @@ -714,9 +716,9 @@ def q3_k_quant_block( blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4,3).add_(4).to(torch.uint8) + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) - q_scales_offset =(qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) + q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) output_scale = np.empty((nb, K_SCALE_SIZE), dtype=np.uint8) q_scales_offset = q_scales_offset.cpu().numpy().astype(np.uint8) From c3b9213b5c34af034ef291ddf47acb9c0f32d483 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:54:59 +0800 Subject: [PATCH 23/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9a885212a..3995cdf16 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -770,7 +770,7 @@ def q4_k_quant_block( output_d = max_scales / 63 output_dmin = max_mins / 63 q_scales = (id_scales * scales).round_().clamp_(0, 63).to(torch.uint8) - q_mins = (id_mins * mins).round_().clip(0, 63).to(torch.uint8) + q_mins = (id_mins * mins).round_().clamp_(0, 63).to(torch.uint8) d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins From c2fe2672a0dff6d132146867672f0b5bc4c8cb1c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:55:45 +0800 Subject: [PATCH 24/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 3995cdf16..8da13260f 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -779,6 +779,7 @@ def q4_k_quant_block( blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() .clamp_(0, 15) .to(torch.uint8) ) From 963e6f9007f40dfc607ca60e8b8140ccdc993531 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:56:32 +0800 Subject: [PATCH 25/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 8da13260f..47f7d24c4 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -886,7 +886,7 @@ def q5_k_quant_block( .div_(d_tmp[replace_ids].unsqueeze(-1)) .round_() .clamp_(0, 31) - .to(torch.int8) + .to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq From 4c6366ab28f3da04f17c660040b7d9f86605a329 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:57:20 +0800 Subject: [PATCH 26/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 47f7d24c4..6b8a40eb1 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -910,7 +910,7 @@ def q5_k_quant_block( output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) - q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) From 343dbb6b963f085bc2572546f7f40b148240df96 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:57:45 +0800 Subject: [PATCH 27/57] Update auto_round/data_type/gguf.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/data_type/gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index a0eebfb1c..1e9fe3256 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -643,7 +643,8 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) # TODO split num + # Note: make_q3_quants does not support split_num/chunking; 3-bit quantization is performed in a single chunk. 
+ scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) From 932f407d5df926c703e78f43e4619e3377f13117 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:00:43 +0800 Subject: [PATCH 28/57] Update auto_round/compressors/base.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 48f082f3a..5cb487e2d 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1094,7 +1094,7 @@ def _quantize_embedding_layer(self): # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(dtype).to(self.device), + module.weight.to(dtype=dtype, device=self.device), **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: From 8304a0255d06f3823339af899a7471eb8e31e2cc Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:01:36 +0800 Subject: [PATCH 29/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 6b8a40eb1..c477f3b90 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -956,7 +956,7 @@ def q6_k_quant_block( output_d = d_scale.reshape(-1, 1).to(torch.float32) rd = get_reciprocal(output_d) output_scale = scales.mul(rd).round_().clamp_(max=127).to(torch.int8) - rs = get_reciprocal(scales).unsqueeze_(-1) # inplace unsqueeze + rs = get_reciprocal(scales).unsqueeze_(-1) # unsqueeze for broadcasting all_L = blocks.mul(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) elif original: scales, all_L = make_qx_quants(blocks, bits=6, rmse_type=1, qw=None) From bc86fdcf4235b4bc1b023f3127c2e8e7c300ec57 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:03:40 +0800 Subject: [PATCH 30/57] fix by comments --- auto_round/data_type/gguf.py | 2 +- auto_round/data_type/int.py | 2 +- auto_round/export/export_to_gguf/packing.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index a0eebfb1c..515f5f1af 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -489,7 +489,7 @@ def quant_tensor_gguf_asym_dq( ) inverse_scale = get_reciprocal(scale) - tensor = tensor.add_(wmin) + tensor = tensor+wmin tensor = (tensor.mul_(inverse_scale)).round_().clamp_(0, maxq) tensor = tensor.mul_(scale) tensor = tensor.sub_(wmin).to(orig_dtype) diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 8c7b1f261..960a7fc08 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -109,7 +109,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w = tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) + 
int_w = tensor.div(scale).round_().clamp_(-maxq, maxq - 1) qdq_result = (int_w.mul_(scale)).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, scale, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9a885212a..198e45476 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -967,8 +967,8 @@ def q6_k_quant_block( output_scale = (iscales * scales).round_().clamp_(max=127).to(torch.int8) d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 - all_L[replace_ids] = ( - (blocks[replace_ids] / d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L[replace_ids] = (blocks[replace_ids].div_(d_tmp[replace_ids]). + reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq From 033330e7f335b384257b06a89f3acd7af945d84f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 06:04:30 +0000 Subject: [PATCH 31/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 2 +- auto_round/export/export_to_gguf/packing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 317721189..aa7eb6447 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -489,7 +489,7 @@ def quant_tensor_gguf_asym_dq( ) inverse_scale = get_reciprocal(scale) - tensor = tensor+wmin + tensor = tensor + wmin tensor = (tensor.mul_(inverse_scale)).round_().clamp_(0, maxq) tensor = tensor.mul_(scale) tensor = tensor.sub_(wmin).to(orig_dtype) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index b62227639..1f3b3a1f3 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -968,8 +968,8 @@ def q6_k_quant_block( output_scale = (iscales * scales).round_().clamp_(max=127).to(torch.int8) d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 - all_L[replace_ids] = (blocks[replace_ids].div_(d_tmp[replace_ids]). - reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L[replace_ids] = ( + blocks[replace_ids].div_(d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq From c905bd27cc22786d6dde260bea6cee8ead632e2f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:13:44 +0800 Subject: [PATCH 32/57] fix line too long --- auto_round/data_type/gguf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index aa7eb6447..4007b1d72 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -643,9 +643,10 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - # Note: make_q3_quants does not support split_num/chunking; 3-bit quantization is performed in a single chunk. + # Note: make_q3_quants does not support split_num/chunking; + # 3-bit quantization is performed in a single chunk. 
scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) - ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) + # scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) else: From a61657937c75013f5810e38cc0e4ee47758cd6d8 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:20:09 +0800 Subject: [PATCH 33/57] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 367a08c7c..f0b0ebb25 100644 --- a/README.md +++ b/README.md @@ -184,14 +184,14 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") Important Hyperparameters ##### Quantization Scheme & Configuration -- **`scheme` (str|dict|AutoScheme)**: The predefined quantization keys, e.g. `W4A16`, `MXFP4`, `NVFP4`, `GGUF:Q4_K_M`. +- **`scheme` (str|dict|AutoScheme)**: The predefined quantization keys, e.g. `W4A16`, `MXFP4`, `NVFP4`, `GGUF:Q4_K_M`. For MXFP4/NVFP4, we recommend exporting to LLM-Compressor format. - **`bits` (int)**: Number of bits for quantization (default is `None`). If not None, it will override the scheme setting. - **`group_size` (int)**: Size of the quantization group (default is `None`). If not None, it will override the scheme setting. - **`sym` (bool)**: Whether to use symmetric quantization (default is `None`). If not None, it will override the scheme setting. -- **`layer_config` (dict)**: Configuration for weight quantization (default is `None`), mainly for mixed schemes. +- **`layer_config` (dict)**: Configuration for layer_wise scheme (default is `None`), mainly for customized mixed schemes. ##### Algorithm Settings -- **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0` Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. +- **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0`. Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. - **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled). 
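A minimal sketch of how these options fit together, assuming the keyword arguments described above are accepted by the `AutoRound` constructor (the checkpoint name and the 8-bit `lm_head` override are illustrative placeholders, not recommendations):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Customized mixed scheme: W4A16 everywhere, but keep lm_head at 8 bits.
layer_config = {"lm_head": {"bits": 8}}

ar = AutoRound(
    model,
    tokenizer,
    scheme="W4A16",          # predefined scheme key
    layer_config=layer_config,
    disable_opt_rtn=False,   # keep the improved RTN path
)
ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
```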
##### Tuning Process Parameters From cd01f1320a2a5a2fb34203c8ac40ee215b9a534d Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 15:43:15 +0800 Subject: [PATCH 34/57] update --- auto_round/compressors/base.py | 2 +- auto_round/export/export_to_gguf/convert.py | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 5cb487e2d..ada41fa65 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1362,7 +1362,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - self._quantize_embedding_layer() + # self._quantize_embedding_layer() self.model.to("cpu") # Release memory diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index c075bbe7c..04911292a 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -190,7 +190,7 @@ def is_extra_tensor(tensor_name): def _quant_data_with_args(data_torch, data_qtype, scale, zp, d_scale=None, wmin=None, d_wmin=None, imatrix=None): - device = get_packing_device() + device = data_torch.device data_torch = data_torch.to(torch.float32) scale = scale.to(torch.float32) if isinstance(scale, torch.Tensor) else scale zp = zp.to(torch.float32) if isinstance(zp, torch.Tensor) else zp @@ -215,7 +215,7 @@ def _quant_data_with_args(data_torch, data_qtype, scale, zp, d_scale=None, wmin= def _quant_data(cls, data_torch, data_qtype, name, modify_name, bid): suffix = ".weight" - device = get_packing_device() + device = data_torch.device if suffix in name: layer_name = name[: -len(suffix)] module = get_module(cls.model, layer_name) @@ -406,9 +406,10 @@ def prepare_tensors(cls): modify_name = _special_name_handle(cls, name) orig_device = data_torch.device - data_torch = data_torch.to("cpu") + import psutil, os + process = psutil.Process(os.getpid()) + print(f"CPU RAM: {process.memory_info().rss / 1024 ** 2:.2f} MB") for new_name, data_torch in cls.modify_tensors(data_torch, modify_name, bid): - data_torch.to(orig_device) skip = False for tensor_info in cls.gguf_writer.tensors: if new_name in tensor_info: @@ -417,12 +418,7 @@ def prepare_tensors(cls): break if skip: continue - data = data_torch.squeeze().cpu().numpy() - - # if data ends up empty, it means data_torch was a scalar tensor -> restore - if len(data.shape) == 0: - data = data_torch.numpy() - + data = data_torch.squeeze() n_dims = len(data.shape) data_qtype: gguf.GGMLQuantizationType | bool = cls.tensor_force_quant(name, new_name, bid, n_dims) @@ -537,6 +533,11 @@ def prepare_tensors(cls): gguf.GGMLQuantizationType.BF16, gguf.GGMLQuantizationType.F32, ]: + data = data_torch.squeeze().cpu().numpy() + + # if data ends up empty, it means data_torch was a scalar tensor -> restore + if len(data.shape) == 0: + data = data_torch.numpy() try: data = gguf.quants.quantize(data, data_qtype) except gguf.QuantError as e: From 876267e2fef4498a89546fa5e6b2176f8583cf4f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 07:44:03 +0000 Subject: [PATCH 35/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/convert.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 04911292a..c7bbe25fc 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -406,7 +406,10 @@ def prepare_tensors(cls): modify_name = _special_name_handle(cls, name) orig_device = data_torch.device - import psutil, os + import os + + import psutil + process = psutil.Process(os.getpid()) print(f"CPU RAM: {process.memory_info().rss / 1024 ** 2:.2f} MB") for new_name, data_torch in cls.modify_tensors(data_torch, modify_name, bid): From 1138c737993c985f9493622a86e74c1c98cd8ca9 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 10:31:02 +0800 Subject: [PATCH 36/57] clean code --- auto_round/export/export_to_gguf/convert.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 04911292a..572e4bb6f 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -406,9 +406,6 @@ def prepare_tensors(cls): modify_name = _special_name_handle(cls, name) orig_device = data_torch.device - import psutil, os - process = psutil.Process(os.getpid()) - print(f"CPU RAM: {process.memory_info().rss / 1024 ** 2:.2f} MB") for new_name, data_torch in cls.modify_tensors(data_torch, modify_name, bid): skip = False for tensor_info in cls.gguf_writer.tensors: From be5e13cf6a4deacc17ae7d7487d7d97e0c1e2908 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 16:53:59 +0800 Subject: [PATCH 37/57] update --- auto_round/export/export_to_gguf/packing.py | 99 +++++++++++---------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 1f3b3a1f3..cd7d9f32f 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -52,53 +52,62 @@ def ggml_quant( shape = data.shape n_blocks = data.nelement() // block_size - split_num = 1 - for dim in data.shape: - if dim > 100_000: - split_num = 16 - break - + split_num = 16 if max(data.shape) > 100_000 else 1 blocks = data.reshape((n_blocks, block_size)) quant_func = GGML_QUANT_TYPE[ggml_type] - try: - new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original, - split_num=split_num, - ) - except torch.OutOfMemoryError: - orig_device = blocks.device - device = "cpu" - blocks = blocks.to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory(device_list=orig_device) - new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original, - split_num=split_num, - ) - - assert new_data.shape[-1] == type_size - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) - new_data = new_data.reshape(*shape[:-1], -1) - return new_data + results = [] + for i in range(split_num): + if split_num > 1: + start = (n_blocks * i) // split_num + end = (n_blocks * (i + 1)) // split_num + blocks = data.reshape((n_blocks, 
block_size))[start:end] + scale = scale[start:end] if scale is not None else scale + zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin[start:end] if wmin is not None else wmin + d_scale = d_scale[start:end] if d_scale is not None else d_scale + d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + # imatrix = imatrix[start:end] if imatrix is not None else imatrix + try: + new_data = quant_func( + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + ) + except torch.OutOfMemoryError: + orig_device = blocks.device + device = "cpu" + blocks = blocks.to(device) + scale = scale.to(device) if scale is not None else scale + zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device) if wmin is not None else wmin + d_scale = d_scale.to(device) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(device) if imatrix is not None else imatrix + clear_memory(device_list=orig_device) + new_data = quant_func( + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original + ) + + assert new_data.shape[-1] == type_size + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) + new_data = new_data.reshape(*shape[:-1], -1) + results.append(new_data) + if len(results)==1: + return results[0] + else: + return torch.cat(results, dim=0) def torch_roundf(n): From 190ea033e354408de295cb2dfcc82b07fb735fbb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 08:56:43 +0000 Subject: [PATCH 38/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index cd7d9f32f..773b1fcf8 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -90,21 +90,14 @@ def ggml_quant( imatrix = imatrix.to(device) if imatrix is not None else imatrix clear_memory(device_list=orig_device) new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) assert new_data.shape[-1] == type_size new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) new_data = new_data.reshape(*shape[:-1], -1) results.append(new_data) - if len(results)==1: + if len(results) == 1: return results[0] else: return torch.cat(results, dim=0) From f16cde5c010b6e403e9ccea14b240d10713de720 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 17:40:05 +0800 Subject: [PATCH 39/57] fix typo --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 9dc931ecf..719b45d36 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2851,7 +2851,7 @@ def _quantize_block( if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids,device_list=self.device_list) + clear_memory(input_ids, device_list=self.device_list) memory_info_summary = 
memory_monitor.get_summary() logger.infoclean(dump_info + "," + memory_info_summary) From 9f408d15ac120aac766a91390100a48fd0075c46 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 17:57:36 +0800 Subject: [PATCH 40/57] update --- auto_round/data_type/gguf.py | 27 +++----- auto_round/export/export_to_gguf/packing.py | 72 +++++++-------------- auto_round/utils/device.py | 4 +- 3 files changed, 35 insertions(+), 68 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 4007b1d72..f96138892 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -452,7 +452,6 @@ def quant_tensor_gguf_asym_dq( wmin=None, d_scale=None, d_wmin=None, - split_num=None, **kwargs, ): """Quantizes and dequantizes a tensor using asymmetric integer quantization for formats like Q2_K, Q4_K, and Q5_K. @@ -473,12 +472,7 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - if split_num is None: - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break + split_num=1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) @@ -499,7 +493,7 @@ def quant_tensor_gguf_asym_dq( # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( - data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 + data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=1 ): dtype = torch.float32 data = data.to(dtype) @@ -602,9 +596,11 @@ def iterative_wls_quant_search_chunk( results_rmin.append(-rmin.to(torch.float32)) if split_num > 1: - clear_memory(device_list=[data.device]) - - return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) + clear_memory(device_list=data.device) + if len(results_scale)>1: + return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) + else: + return results_scale[0], results_rmin[0] def iterative_wls_quant_search( @@ -671,7 +667,7 @@ def quant_tensor_gguf_sym_dq( scale=None, d_scale=None, scale_dtype=torch.float16, - split_num=None, + split_num=1, **kwargs, ): """Quantize and de-quantize tensor asymmetrically. For Q3_K, Q6_K. 
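The split_num plumbing above follows one pattern throughout these patches: process the flattened weight groups in row chunks and release cached device memory between chunks so that peak VRAM stays bounded. A self-contained sketch of that pattern, assuming a generic per-chunk quantizer `fn` and using torch.cuda.empty_cache() as a rough stand-in for the project's clear_memory helper:

import torch

def apply_in_chunks(tensor: torch.Tensor, fn, split_num: int = 16):
    # Apply `fn` to row chunks of `tensor` and concatenate the results,
    # emptying the CUDA cache between chunks when chunking is active.
    n_rows = tensor.shape[0]
    chunk = (n_rows + split_num - 1) // split_num
    outs = []
    for i in range(split_num):
        part = tensor[i * chunk : (i + 1) * chunk]
        if part.numel() == 0:
            continue
        outs.append(fn(part))
        if split_num > 1 and part.is_cuda:
            torch.cuda.empty_cache()  # rough analogue of clear_memory(device_list=...)
    return outs[0] if len(outs) == 1 else torch.cat(outs, dim=0)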
@@ -698,13 +694,6 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 - if split_num is None: - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break - tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) orig_dtype = tensor.dtype super_bits = 6 if bits == 3 else 8 diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 773b1fcf8..76713b538 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -56,6 +56,7 @@ def ggml_quant( blocks = data.reshape((n_blocks, block_size)) quant_func = GGML_QUANT_TYPE[ggml_type] results = [] + orig_device = blocks.device for i in range(split_num): if split_num > 1: start = (n_blocks * i) // split_num @@ -66,6 +67,7 @@ def ggml_quant( wmin = wmin[start:end] if wmin is not None else wmin d_scale = d_scale[start:end] if d_scale is not None else d_scale d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + shape = data.shape # imatrix = imatrix[start:end] if imatrix is not None else imatrix try: new_data = quant_func( @@ -79,7 +81,6 @@ def ggml_quant( original=original, ) except torch.OutOfMemoryError: - orig_device = blocks.device device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -92,15 +93,18 @@ def ggml_quant( new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) - - assert new_data.shape[-1] == type_size - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) - new_data = new_data.reshape(*shape[:-1], -1) results.append(new_data) + if split_num>1: + print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!",flush=True) + clear_memory(device_list=orig_device) + if len(results) == 1: - return results[0] + new_data= results[0] else: - return torch.cat(results, dim=0) + new_data = np.concatenate(results, axis=0) + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) + new_data = new_data.reshape(*shape[:-1], -1) + return new_data def torch_roundf(n): @@ -199,9 +203,12 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): L_list.append(L.to(torch.uint8)) # Concatenate all chunks along batch dimension - scales = torch.cat(scales_list, dim=0) - L = torch.cat(L_list, dim=0) - return scales, L + if len(scales_list)>1: + scales = torch.cat(scales_list, dim=0) + L = torch.cat(L_list, dim=0) + return scales, L + else: + return scales, L def make_qx_quants(data, bits, rmse_type=0, qw=None): @@ -593,7 +600,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") def q2_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs ): nb = blocks.shape[0] device = blocks.device @@ -645,18 +652,10 @@ def q2_k_quant_block( blocks.reshape(blocks.shape[0], -1) blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") - mins = 
mins.to("cpu") - d_wmin = d_wmin.to("cpu") - clear_memory(device_list=[device]) - blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) mins = mins.reshape((-1, QK_K // 16)) @@ -681,7 +680,7 @@ def q2_k_quant_block( @register_qtype("q3_k") def q3_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -707,13 +706,9 @@ def q3_k_quant_block( blocks = blocks.reshape(blocks.shape[0], -1) blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) @@ -791,16 +786,10 @@ def q4_k_quant_block( blocks.reshape(blocks.shape[0], -1) blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") - mins = mins.to("cpu") - d_wmin = d_wmin.to("cpu") blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) @@ -847,7 +836,6 @@ def q5_k_quant_block( d_wmin=None, imatrix=None, original=False, - split_num=1, **kwargs, ): nb = blocks.shape[0] @@ -895,16 +883,10 @@ def q5_k_quant_block( blocks.reshape(blocks.shape[0], -1) blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") - mins = mins.to("cpu") - d_wmin = d_wmin.to("cpu") blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) @@ -948,7 +930,7 @@ def q5_k_quant_block( @register_qtype("q6_k") def q6_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) @@ -978,13 +960,9 @@ def q6_k_quant_block( blocks = blocks.reshape(blocks.shape[0], -1) blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 496f399bb..1db1fce69 100644 --- 
a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1367,7 +1367,7 @@ def update(self, device_list=None): process = psutil.Process() current_ram = process.memory_info().rss / 1024**3 # GB self.peak_ram = max(self.peak_ram, current_ram) - if device_list is None: # TODO this have issue, wait for clean memory all pass device_list + if device_list is None: # TODO this has issue, wait for clean_memory all pass device_list device_list = [0] if device_list is not None: if not isinstance(device_list, (list, tuple)): @@ -1379,7 +1379,7 @@ def update(self, device_list=None): device_list = list(range(torch.xpu.device_count())) for device in device_list: - if device == "cpu": + if str(device) == "cpu": continue if torch.cuda.is_available(): current_vram = torch.cuda.memory_reserved(device) / 1024**3 # GB From 2d059f3f8f27eb8cb238bfd42e77e76673da3f1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 09:58:53 +0000 Subject: [PATCH 41/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 4 +- auto_round/export/export_to_gguf/packing.py | 42 +++++++-------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index f96138892..27a97f010 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -472,7 +472,7 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - split_num=1 + split_num = 1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) @@ -597,7 +597,7 @@ def iterative_wls_quant_search_chunk( if split_num > 1: clear_memory(device_list=data.device) - if len(results_scale)>1: + if len(results_scale) > 1: return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) else: return results_scale[0], results_rmin[0] diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 76713b538..cdfe74eed 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -94,14 +94,14 @@ def ggml_quant( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) results.append(new_data) - if split_num>1: - print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!",flush=True) + if split_num > 1: + print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!", flush=True) clear_memory(device_list=orig_device) if len(results) == 1: - new_data= results[0] + new_data = results[0] else: - new_data = np.concatenate(results, axis=0) + new_data = np.concatenate(results, axis=0) new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) new_data = new_data.reshape(*shape[:-1], -1) return new_data @@ -203,7 +203,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): L_list.append(L.to(torch.uint8)) # Concatenate all chunks along batch dimension - if len(scales_list)>1: + if len(scales_list) > 1: scales = torch.cat(scales_list, dim=0) L = torch.cat(L_list, dim=0) return scales, L @@ -599,9 +599,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") -def q2_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs -): +def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, 
d_wmin=None, imatrix=None, original=False, **kwargs): nb = blocks.shape[0] device = blocks.device blocks = blocks.reshape((nb, QK_K // 16, 16)) # (nb, 16, 16) @@ -651,9 +649,7 @@ def q2_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] blocks = blocks.reshape((nb, QK_K // 16, 16)) @@ -679,9 +675,7 @@ def q2_k_quant_block( @register_qtype("q3_k") -def q3_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs -): +def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -705,9 +699,7 @@ def q3_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) @@ -785,9 +777,7 @@ def q4_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -882,9 +872,7 @@ def q5_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -929,9 +917,7 @@ def q5_k_quant_block( @register_qtype("q6_k") -def q6_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs -): +def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) device = blocks.device @@ -959,9 +945,7 @@ def q6_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) From 78499a76523ae0a56f8f17a1bba25c23c94f9dd1 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 19:30:48 +0800 Subject: [PATCH 42/57] try to fix ut failure --- auto_round/export/export_to_gguf/packing.py | 29 ++++++++++----------- 1 file changed, 14 insertions(+), 15 
deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index cdfe74eed..869bc1eb8 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -95,7 +95,6 @@ def ggml_quant( ) results.append(new_data) if split_num > 1: - print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!", flush=True) clear_memory(device_list=orig_device) if len(results) == 1: @@ -510,7 +509,7 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): id = get_reciprocal(d) n_blocks = blocks.shape[0] - qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) + qs = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() @@ -567,7 +566,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -587,7 +586,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: else: d = torch.abs(blocks).max(dim=1, keepdim=True)[0] / 127 id = get_reciprocal(d) - blocks = blocks.mul_(id) + blocks = blocks.mul(id) qs = torch_roundf(blocks).clamp_(-128, 127) # (n_blocks, 2) @@ -611,7 +610,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -659,7 +658,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) @@ -682,7 +681,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) elif original: scales, 
_ = make_q3_quants(blocks, bits=3, do_rmse=True) @@ -693,7 +692,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) - all_L = blocks.mul_(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * inverse_dq_scale).round_().clamp_(-32, 31).add_(32) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -705,7 +704,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) @@ -743,7 +742,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -789,7 +788,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -839,7 +838,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -884,7 +883,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -939,7 +938,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks[replace_ids].div_(d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) + blocks[replace_ids].div(d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -952,7 +951,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(max=127).to(torch.int8) - all_L = blocks.mul_(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) + 
all_L = blocks.mul(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) tmp_L = all_L.reshape(nb, 4, 64) & 0xF output_ql = (tmp_L[:, ::2] | (tmp_L[:, 1::2] << 4)).reshape(nb, QK_K // 2).cpu().numpy().astype(np.uint8) From 575103fc8c6fa615c3a0c0de7422a570374458cb Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 21:20:48 +0800 Subject: [PATCH 43/57] try to fix ut failure --- auto_round/compressors/base.py | 5 +- auto_round/export/export_to_gguf/packing.py | 157 ++++++++++---------- 2 files changed, 84 insertions(+), 78 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 719b45d36..2ccb22486 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -714,9 +714,8 @@ def _check_compatibility(self) -> None: if has_gguf and self.iters != 0 and self.bits != 3 and not self.enable_alg_ext: logger.warning( "`iters=0` is recommended when exporting to GGUF format except for bits 3," - " as we have optimized the RTN method for this case." - " Or add enable_alg_ext to use the new algorithm," - " refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" + " or add `enable_alg_ext` for better accuracy with much more tuning cost" + " Please refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" " to check the acc." ) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 869bc1eb8..9158f3742 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -29,73 +29,80 @@ def register(cls): return register +def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original): + try: + new_data = quant_func( + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + ) + except torch.OutOfMemoryError: + device = "cpu" + blocks = blocks.to(device) + scale = scale.to(device) if scale is not None else scale + zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device) if wmin is not None else wmin + d_scale = d_scale.to(device) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(device) if imatrix is not None else imatrix + clear_memory(device_list=orig_device) + new_data = quant_func( + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original + ) + return new_data + + def ggml_quant( - data, - ggml_type, - scale=None, - zp=None, - wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - device="cuda", - original=False, + data, + ggml_type, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + device="cuda", + original=False, ): block_size, type_size = GGML_QUANT_SIZES[ggml_type] - data = data.to(torch.float32).to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - shape = data.shape n_blocks = data.nelement() // block_size split_num = 16 if max(data.shape) > 100_000 else 1 blocks = data.reshape((n_blocks, block_size)) + scale = scale.to(device).reshape(blocks.shape[0],-1) if scale is not None else 
scale + zp = zp.to(device).reshape(blocks.shape[0],-1) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device).reshape(blocks.shape[0],-1) if wmin is not None else wmin + d_scale = d_scale.to(device).reshape(blocks.shape[0],-1) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device).reshape(blocks.shape[0],-1) if d_wmin is not None else d_wmin + quant_func = GGML_QUANT_TYPE[ggml_type] results = [] - orig_device = blocks.device + chunk_size = (n_blocks + split_num - 1) // split_num for i in range(split_num): if split_num > 1: - start = (n_blocks * i) // split_num - end = (n_blocks * (i + 1)) // split_num - blocks = data.reshape((n_blocks, block_size))[start:end] - scale = scale[start:end] if scale is not None else scale - zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin[start:end] if wmin is not None else wmin - d_scale = d_scale[start:end] if d_scale is not None else d_scale - d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + start = chunk_size * i + end = chunk_size * (i + 1) + tmp_blocks = blocks[start:end] + tmp_scale = scale[start:end] if scale is not None else scale + tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + tmp_wmin = wmin[start:end] if wmin is not None else wmin + tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale + tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin shape = data.shape + new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, + imatrix, original) # imatrix = imatrix[start:end] if imatrix is not None else imatrix - try: - new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original, - ) - except torch.OutOfMemoryError: - device = "cpu" - blocks = blocks.to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory(device_list=orig_device) - new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original - ) + else: + new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) results.append(new_data) if split_num > 1: - clear_memory(device_list=orig_device) + clear_memory(device_list=device) if len(results) == 1: new_data = results[0] @@ -509,7 +516,7 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): id = get_reciprocal(d) n_blocks = blocks.shape[0] - qs = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) + qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() @@ -566,7 +573,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = 
(qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -610,7 +617,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -628,7 +635,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = (max_mins > 0).squeeze() output_scale[replace_ids] |= ( - torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 + torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 ) d_tmp = output_d * (output_scale & 0xF) @@ -658,7 +665,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) @@ -692,7 +699,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) - all_L = blocks.mul(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul_(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * inverse_dq_scale).round_().clamp_(-32, 31).add_(32) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -704,7 +711,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) @@ -729,7 +736,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") def q4_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -742,7 +749,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 
63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -788,7 +795,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -817,15 +824,15 @@ def q4_k_quant_block( @register_qtype("q5_k") def q5_k_quant_block( - blocks, - scale=None, - zp=None, - wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - original=False, - **kwargs, + blocks, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + original=False, + **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -838,7 +845,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -860,7 +867,7 @@ def q5_k_quant_block( dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks[replace_ids] + blocks_[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) .div_(d_tmp[replace_ids].unsqueeze(-1)) .round_() @@ -883,7 +890,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -926,7 +933,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, rd = get_reciprocal(output_d) output_scale = scales.mul(rd).round_().clamp_(max=127).to(torch.int8) rs = get_reciprocal(scales).unsqueeze_(-1) # unsqueeze for broadcasting - all_L = blocks.mul(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L = blocks.mul_(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) elif original: scales, all_L = make_qx_quants(blocks, bits=6, rmse_type=1, qw=None) imax = abs(scales).argmax(dim=-1, keepdim=True) @@ -951,7 +958,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(max=127).to(torch.int8) - all_L = blocks.mul(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L = blocks.mul_(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) tmp_L = all_L.reshape(nb, 4, 64) & 0xF output_ql = (tmp_L[:, ::2] | (tmp_L[:, 1::2] << 4)).reshape(nb, QK_K // 2).cpu().numpy().astype(np.uint8) From 55efbbca4ecba9a97274881780846436f744b417 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:25:42 +0000 Subject: [PATCH 44/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py 
| 57 +++++++++++---------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9158f3742..9d431898d 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -58,16 +58,16 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri def ggml_quant( - data, - ggml_type, - scale=None, - zp=None, - wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - device="cuda", - original=False, + data, + ggml_type, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + device="cuda", + original=False, ): block_size, type_size = GGML_QUANT_SIZES[ggml_type] data = data.to(torch.float32).to(device) @@ -75,11 +75,11 @@ def ggml_quant( n_blocks = data.nelement() // block_size split_num = 16 if max(data.shape) > 100_000 else 1 blocks = data.reshape((n_blocks, block_size)) - scale = scale.to(device).reshape(blocks.shape[0],-1) if scale is not None else scale - zp = zp.to(device).reshape(blocks.shape[0],-1) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device).reshape(blocks.shape[0],-1) if wmin is not None else wmin - d_scale = d_scale.to(device).reshape(blocks.shape[0],-1) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device).reshape(blocks.shape[0],-1) if d_wmin is not None else d_wmin + scale = scale.to(device).reshape(blocks.shape[0], -1) if scale is not None else scale + zp = zp.to(device).reshape(blocks.shape[0], -1) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device).reshape(blocks.shape[0], -1) if wmin is not None else wmin + d_scale = d_scale.to(device).reshape(blocks.shape[0], -1) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device).reshape(blocks.shape[0], -1) if d_wmin is not None else d_wmin quant_func = GGML_QUANT_TYPE[ggml_type] results = [] @@ -95,8 +95,9 @@ def ggml_quant( tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin shape = data.shape - new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, - imatrix, original) + new_data = ggml_quant_core( + quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, imatrix, original + ) # imatrix = imatrix[start:end] if imatrix is not None else imatrix else: new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) @@ -635,7 +636,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = (max_mins > 0).squeeze() output_scale[replace_ids] |= ( - torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 + torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 ) d_tmp = output_d * (output_scale & 0xF) @@ -736,7 +737,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") def q4_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -824,15 +825,15 @@ def q4_k_quant_block( @register_qtype("q5_k") def q5_k_quant_block( - blocks, - scale=None, - zp=None, - 
wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - original=False, - **kwargs, + blocks, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + original=False, + **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) From 9fa4cd9920ccf57df740b4221ec3f574b4f337a4 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 21:44:59 +0800 Subject: [PATCH 45/57] try to fix ut failure --- auto_round/compressors/base.py | 4 +- auto_round/export/export_to_gguf/packing.py | 54 ++++++++++----------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2ccb22486..06b861ccf 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -713,8 +713,8 @@ def _check_compatibility(self) -> None: raise ValueError("Gguf format is not compatible with other formats, please choose only one of them") if has_gguf and self.iters != 0 and self.bits != 3 and not self.enable_alg_ext: logger.warning( - "`iters=0` is recommended when exporting to GGUF format except for bits 3," - " or add `enable_alg_ext` for better accuracy with much more tuning cost" + "`iters=0` is recommended when exporting to current GGUF format" + " or add `enable_alg_ext` for better accuracy with much more tuning cost." " Please refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" " to check the acc." ) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9158f3742..5ca3c8d06 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -42,15 +42,15 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri original=original, ) except torch.OutOfMemoryError: - device = "cpu" - blocks = blocks.to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory(device_list=orig_device) + cpu_device = "cpu" + blocks = blocks.to(cpu_device) + scale = scale.to(cpu_device) if scale is not None else scale + zp = zp.to(cpu_device) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(cpu_device) if wmin is not None else wmin + d_scale = d_scale.to(cpu_device) if d_scale is not None else d_scale + d_wmin = d_wmin.to(cpu_device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(cpu_device) if imatrix is not None else imatrix + clear_memory(device_list=device) new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) @@ -84,31 +84,31 @@ def ggml_quant( quant_func = GGML_QUANT_TYPE[ggml_type] results = [] chunk_size = (n_blocks + split_num - 1) // split_num - for i in range(split_num): - if split_num > 1: - start = chunk_size * i - end = chunk_size * (i + 1) - tmp_blocks = blocks[start:end] - tmp_scale = scale[start:end] if scale is not None else scale - tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp - tmp_wmin = wmin[start:end] if wmin is not None else wmin - tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale - tmp_d_wmin = 
d_wmin[start:end] if d_wmin is not None else d_wmin - shape = data.shape - new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, - imatrix, original) - # imatrix = imatrix[start:end] if imatrix is not None else imatrix + if split_num > 1: + for i in range(split_num): + start = chunk_size * i + end = chunk_size * (i + 1) + tmp_blocks = blocks[start:end] + tmp_scale = scale[start:end] if scale is not None else scale + tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + tmp_wmin = wmin[start:end] if wmin is not None else wmin + tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale + tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, + imatrix, original) + results.append(new_data) + if split_num > 1: + clear_memory(device_list=device) else: new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) results.append(new_data) - if split_num > 1: - clear_memory(device_list=device) + if len(results) == 1: new_data = results[0] else: new_data = np.concatenate(results, axis=0) - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness new_data = new_data.reshape(*shape[:-1], -1) return new_data @@ -867,7 +867,7 @@ def q5_k_quant_block( dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks_[replace_ids] + blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) .div_(d_tmp[replace_ids].unsqueeze(-1)) .round_() From 70a3fdb8927e86f6296579130dc4bf51ce4d9dff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:52:15 +0000 Subject: [PATCH 46/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py | 30 ++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 93a87fb04..729174a1e 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -86,29 +86,29 @@ def ggml_quant( chunk_size = (n_blocks + split_num - 1) // split_num if split_num > 1: for i in range(split_num): - start = chunk_size * i - end = chunk_size * (i + 1) - tmp_blocks = blocks[start:end] - tmp_scale = scale[start:end] if scale is not None else scale - tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp - tmp_wmin = wmin[start:end] if wmin is not None else wmin - tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale - tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin - new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, - imatrix, original) - results.append(new_data) - if split_num > 1: - clear_memory(device_list=device) + start = chunk_size * i + end = chunk_size * (i + 1) + tmp_blocks = blocks[start:end] + tmp_scale = scale[start:end] if scale is not None else scale + tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + tmp_wmin = wmin[start:end] if wmin is not None else wmin + tmp_d_scale = d_scale[start:end] if d_scale is not None else 
d_scale + tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + new_data = ggml_quant_core( + quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, imatrix, original + ) + results.append(new_data) + if split_num > 1: + clear_memory(device_list=device) else: new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) results.append(new_data) - if len(results) == 1: new_data = results[0] else: new_data = np.concatenate(results, axis=0) - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness new_data = new_data.reshape(*shape[:-1], -1) return new_data From b035b4f7ecbc15f343290365805e9ac07900f25e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 22:07:17 +0800 Subject: [PATCH 47/57] try to fix ut failure --- auto_round/export/export_to_gguf/packing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 729174a1e..aa9e6ba27 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -42,6 +42,7 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri original=original, ) except torch.OutOfMemoryError: + orig_device = blocks.device cpu_device = "cpu" blocks = blocks.to(cpu_device) scale = scale.to(cpu_device) if scale is not None else scale @@ -50,7 +51,7 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri d_scale = d_scale.to(cpu_device) if d_scale is not None else d_scale d_wmin = d_wmin.to(cpu_device) if d_wmin is not None else d_wmin imatrix = imatrix.to(cpu_device) if imatrix is not None else imatrix - clear_memory(device_list=device) + clear_memory(device_list=orig_device) new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) From a7cd959f7c3aa5f5c7e77dc88efef00d88c1af02 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 11:54:43 +0800 Subject: [PATCH 48/57] update --- auto_round/compressors/base.py | 24 ++++++++++------ auto_round/export/export_to_gguf/export.py | 32 +++++++++++++++++++-- auto_round/export/export_to_gguf/packing.py | 1 - 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 06b861ccf..564f219f6 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1229,7 +1229,7 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. 
This function attempts to quantize a layer by switching its data type to a @@ -1252,14 +1252,14 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: if is_fp8_linear(m): m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) set_module(self.model, name, m) - + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device # Step 1: Try quantization on GPU first, fall back to CPU if OOM if self.immediate_packing and self.iters == 0 and "gguf" in self.formats[0] and not self.disable_opt_rtn: + m = m.to(tuning_device) m.scale = None m.zp = None else: try: - tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device m = m.to(tuning_device) m = WrapperLinear( m, @@ -1271,7 +1271,6 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: disable_opt_rtn=self.disable_opt_rtn, ) m = m.unwrapper({}) - m.to("cpu") except torch.OutOfMemoryError: cuda_error_msg = traceback.format_exc() m = m.orig_layer if hasattr(m, "orig_layer") else m @@ -1291,11 +1290,14 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: raise # Step 2: Optional immediate packing/export - if self.immediate_packing: + if self.immediate_packing: # For gguf, packing conducts on block level self._immediate_pack(name) + if to_cpu: + m = m.to("cpu") else: + if to_cpu: + m = m.to("cpu") set_module(self.model, name, m) - if self.immediate_saving: all_to_quantized_module_names = [n for n, m in self.model.named_modules() if check_to_quantized(m)] last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) @@ -1303,6 +1305,8 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: immediate_saving(self, m, name, last_module) def _immediate_pack(self, name: str): + if not self.immediate_packing: + return m = get_module(self.model, name) if not check_to_quantized(m): return @@ -1363,7 +1367,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - # self._quantize_embedding_layer() + self._quantize_embedding_layer() # levea to gguf itself to handle self.model.to("cpu") # Release memory @@ -1515,14 +1519,16 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) set_amax_for_all_moe_layers(block, attr_name="act_max") # Normalize imatrix and quantize layers if self.low_gpu_mem_usage: + block.to("cpu") clear_memory(device_list=self.device_list) + for _, m in block.named_modules(): # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 if hasattr(m, "imatrix"): m.imatrix /= m.imatrix_cnt if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.tmp_name) + self._quantize_layer_via_rtn(m.tmp_name,to_cpu=False) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) @@ -1641,7 +1647,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: else: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding = self._quantize_embedding_layer() + is_quantized_embedding,_ = self._quantize_embedding_layer() clear_memory(device_list=self.device_list) all_q_inputs = None if 
is_quantized_embedding: diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 140776087..48b103a4e 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -159,13 +159,16 @@ def pack_gguf_layer( model_type=convert_hf_to_gguf.ModelType.MMPROJ, ) ) + if not hasattr(model, "last_layer_name_to_block_name"): block_name_to_last_layer_name = {} block_names = get_block_names(model, quant_vision=True) block_names_flatten = flatten_list(block_names) + all_qlayer_name = [] for n, m in model.named_modules(): if not check_to_quantized(m): continue + all_qlayer_name.append(n) for block_name in block_names_flatten: block_name_split = block_name.split(".") name_split = n.split(".") @@ -177,13 +180,23 @@ def pack_gguf_layer( block_name_to_last_layer_name[block_name] = n last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} model.last_layer_name_to_block_name = last_layer_name_to_block_name + names_in_blocks=[] + for block_name in block_names_flatten: + block = get_module(model, block_name) + for n,m in block.named_modules(): + if check_to_quantized(m): + names_in_blocks.append(m.tmp_name) + names_outside_blocks = list(set(layer_config.keys()) - set(names_in_blocks)) + model.names_outside_blocks = names_outside_blocks + if name in model.last_layer_name_to_block_name: # Packing block + block = get_module(model, model.last_layer_name_to_block_name[name]) for gguf_model in gguf_model_instance_global: gguf_model.current_packing_block = model.last_layer_name_to_block_name[name] gguf_model.prepare_tensors() - block = get_module(model, model.last_layer_name_to_block_name[name]) + for n, m in block.named_modules(): if hasattr(m, "weight"): m.weight = None @@ -193,6 +206,21 @@ def pack_gguf_layer( if len(model.last_layer_name_to_block_name) == 0: for gguf_model in gguf_model_instance_global: gguf_model.current_packing_block = None + if name in model.names_outside_blocks: + # Packing block + for gguf_model in gguf_model_instance_global: + gguf_model.current_packing_block =name + gguf_model.prepare_tensors() + + layer = get_module(model, name) + if hasattr(layer, "weight"): + layer.weight = None + if hasattr(layer, "bias"): + layer.bias = None + model.names_outside_blocks.remove(name) + if len(model.names_outside_blocks) == 0: + for gguf_model in gguf_model_instance_global: + gguf_model.current_packing_block = None @torch.inference_mode() @@ -219,4 +247,4 @@ def save_quantized_as_gguf(output_dir, backend="gguf:q4_0", layer_config=None, v logger.info(f"Model successfully exported to {gguf_model.fname_out}, running time={rt}") del gguf_model_instance_global - return model + return model \ No newline at end of file diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index aa9e6ba27..9a48cfa0d 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -81,7 +81,6 @@ def ggml_quant( wmin = wmin.to(device).reshape(blocks.shape[0], -1) if wmin is not None else wmin d_scale = d_scale.to(device).reshape(blocks.shape[0], -1) if d_scale is not None else d_scale d_wmin = d_wmin.to(device).reshape(blocks.shape[0], -1) if d_wmin is not None else d_wmin - quant_func = GGML_QUANT_TYPE[ggml_type] results = [] chunk_size = (n_blocks + split_num - 1) // split_num From ae99930ccf1d98b5beb86acc573e0658ca076b22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Nov 2025 03:55:29 +0000 Subject: [PATCH 49/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 8 ++++---- auto_round/export/export_to_gguf/export.py | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 564f219f6..a841e7ea2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1290,7 +1290,7 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T raise # Step 2: Optional immediate packing/export - if self.immediate_packing: # For gguf, packing conducts on block level + if self.immediate_packing: # For gguf, packing conducts on block level self._immediate_pack(name) if to_cpu: m = m.to("cpu") @@ -1367,7 +1367,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - self._quantize_embedding_layer() # levea to gguf itself to handle + self._quantize_embedding_layer() # levea to gguf itself to handle self.model.to("cpu") # Release memory @@ -1528,7 +1528,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) if hasattr(m, "imatrix"): m.imatrix /= m.imatrix_cnt if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.tmp_name,to_cpu=False) + self._quantize_layer_via_rtn(m.tmp_name, to_cpu=False) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) @@ -1647,7 +1647,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: else: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding,_ = self._quantize_embedding_layer() + is_quantized_embedding, _ = self._quantize_embedding_layer() clear_memory(device_list=self.device_list) all_q_inputs = None if is_quantized_embedding: diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 48b103a4e..0a5bfc461 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -180,10 +180,10 @@ def pack_gguf_layer( block_name_to_last_layer_name[block_name] = n last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} model.last_layer_name_to_block_name = last_layer_name_to_block_name - names_in_blocks=[] + names_in_blocks = [] for block_name in block_names_flatten: block = get_module(model, block_name) - for n,m in block.named_modules(): + for n, m in block.named_modules(): if check_to_quantized(m): names_in_blocks.append(m.tmp_name) names_outside_blocks = list(set(layer_config.keys()) - set(names_in_blocks)) @@ -196,7 +196,6 @@ def pack_gguf_layer( gguf_model.current_packing_block = model.last_layer_name_to_block_name[name] gguf_model.prepare_tensors() - for n, m in block.named_modules(): if hasattr(m, "weight"): m.weight = None @@ -209,7 +208,7 @@ def pack_gguf_layer( if name in model.names_outside_blocks: # Packing block for gguf_model in gguf_model_instance_global: - gguf_model.current_packing_block =name + gguf_model.current_packing_block = name gguf_model.prepare_tensors() layer = get_module(model, name) @@ -247,4 +246,4 @@ def 
save_quantized_as_gguf(output_dir, backend="gguf:q4_0", layer_config=None, v logger.info(f"Model successfully exported to {gguf_model.fname_out}, running time={rt}") del gguf_model_instance_global - return model \ No newline at end of file + return model From 79b249029966a8c4ae4af74a05435fb8c1251369 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 12:04:47 +0800 Subject: [PATCH 50/57] fix --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 564f219f6..2ccb34427 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1367,7 +1367,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - self._quantize_embedding_layer() # levea to gguf itself to handle + # self._quantize_embedding_layer() # leave to gguf itself to handle self.model.to("cpu") # Release memory From 29b51885bf815d343d92c2ef03694cde78e366aa Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 12:08:24 +0800 Subject: [PATCH 51/57] update Signed-off-by: Wenhua Cheng --- auto_round/compressors/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2ccb34427..54561e464 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1363,11 +1363,9 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: for module in tqdm(modules, desc="Update weight global scale for fuse module"): update_fused_layer_global_scales(module) - has_gguf_k = ( - any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None - ) - # self._quantize_embedding_layer() # leave to gguf itself to handle + if not (any("gguf" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None): + self._quantize_embedding_layer() # leave to gguf itself to handle self.model.to("cpu") # Release memory @@ -1375,6 +1373,10 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = False if not self.disable_opt_rtn: + has_gguf_k = ( + any("gguf" in fmt and "k" in fmt for fmt in + getattr(self, "formats", [])) or self.super_bits is not None + ) if has_gguf_k: enable_imatrix = True elif self.data_type == "int" and self.sym: From 5e0bb6c590576ef9c78be8103b1ca8fddea59609 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Nov 2025 04:10:42 +0000 Subject: [PATCH 52/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 63c36b726..b0dbcc381 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1364,7 +1364,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: update_fused_layer_global_scales(module) if not (any("gguf" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None): - self._quantize_embedding_layer() # leave to gguf itself to handle + self._quantize_embedding_layer() # leave to gguf itself to handle self.model.to("cpu") # Release memory @@ -1373,8 +1373,7 @@ def _quantize_rtn(self) -> 
tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = False if not self.disable_opt_rtn: has_gguf_k = ( - any("gguf" in fmt and "k" in fmt for fmt in - getattr(self, "formats", [])) or self.super_bits is not None + any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) if has_gguf_k: enable_imatrix = True From d6d2979589863b209df8ead4373c5fcc88ad3c15 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 13:08:13 +0800 Subject: [PATCH 53/57] fix typo --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 63c36b726..951ee9e66 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1648,7 +1648,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: else: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding, _ = self._quantize_embedding_layer() + is_quantized_embedding = self._quantize_embedding_layer() clear_memory(device_list=self.device_list) all_q_inputs = None if is_quantized_embedding: From 1c8fe02d3910a55f7e364e7fab8695f84affd006 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 23 Nov 2025 22:27:49 -0500 Subject: [PATCH 54/57] fix bug of gguf mllm Signed-off-by: n1ck-guo --- auto_round/export/export_to_gguf/convert.py | 16 +++---------- auto_round/utils/model.py | 25 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 572e4bb6f..5a00c49ed 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger +from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger, clean_module_parameter gguf = LazyImport("gguf") @@ -58,17 +58,6 @@ from torch import Tensor -def clean_module_parameter(submodule, parameter): - if submodule is None: - return - is_buffer = parameter in submodule._buffers - with torch.no_grad(): - if is_buffer: - submodule._buffers[parameter] = None - else: - submodule._parameters[parameter] = None - - def download_convert_file(redownload=False): CONVERT_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/convert_hf_to_gguf.py" FILE_NAME = "convert_hf_to_gguf.py" @@ -375,7 +364,8 @@ def prepare_tensors(cls): max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in chain(cls.generate_extra_tensors(), cls.get_tensors()): - if data_torch is None: + + if data_torch is None or data_torch.numel() == 0: continue # we don't need these if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index ff9b3b57c..34744274d 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -28,6 +28,31 @@ from auto_round.logger import logger from auto_round.schemes import QuantizationScheme +def clean_module_parameter(submodule: torch.nn.Module, param_name: str) -> None: + """This function is 
recommended to be used instead of module.weight = None. + For models like `tie_word_embeddings`, setting the embedding weight to None + causes `lm_head` to reallocate memory for its weight instead of treating it as a "bound shared weight," + it's now iterated over as an independent parameter, + resulting in an additional `lm_head` parameter in `named_parameters`. + + Args: + submodule (torch.nn.Module): submodule to clean + param_name (str): "weight" or "bias" + """ + if submodule is None: + return + is_buffer = param_name in submodule._buffers + with torch.no_grad(): + if is_buffer: + buf = submodule._buffers[param_name] + if buf is not None: + buf.data = torch.empty(0, dtype=buf.dtype, device=buf.device) + buf.requires_grad = False + else: + param = submodule._parameters[param_name] + if param is not None: + param.data = torch.empty(0, dtype=param.dtype, device=param.device) + param.requires_grad = False def convert_dtype_str2torch(str_dtype): """Converts a string dtype to its corresponding PyTorch dtype. From 0c254d7092d23779d1308f9b46bca98f2d06a958 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 03:27:29 +0000 Subject: [PATCH 55/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/convert.py | 12 ++++++++++-- auto_round/utils/model.py | 8 +++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 5a00c49ed..5f42535d3 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,15 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger, clean_module_parameter +from auto_round.utils import ( + LazyImport, + clean_module_parameter, + clear_memory, + get_module, + get_packing_device, + is_fp8_model, + logger, +) gguf = LazyImport("gguf") @@ -364,7 +372,7 @@ def prepare_tensors(cls): max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in chain(cls.generate_extra_tensors(), cls.get_tensors()): - + if data_torch is None or data_torch.numel() == 0: continue # we don't need these diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 34744274d..1c2fc7987 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -28,11 +28,12 @@ from auto_round.logger import logger from auto_round.schemes import QuantizationScheme + def clean_module_parameter(submodule: torch.nn.Module, param_name: str) -> None: """This function is recommended to be used instead of module.weight = None. - For models like `tie_word_embeddings`, setting the embedding weight to None - causes `lm_head` to reallocate memory for its weight instead of treating it as a "bound shared weight," - it's now iterated over as an independent parameter, + For models like `tie_word_embeddings`, setting the embedding weight to None + causes `lm_head` to reallocate memory for its weight instead of treating it as a "bound shared weight," + it's now iterated over as an independent parameter, resulting in an additional `lm_head` parameter in `named_parameters`. 
Args: @@ -54,6 +55,7 @@ def clean_module_parameter(submodule: torch.nn.Module, param_name: str) -> None: param.data = torch.empty(0, dtype=param.dtype, device=param.device) param.requires_grad = False + def convert_dtype_str2torch(str_dtype): """Converts a string dtype to its corresponding PyTorch dtype. From f85fe7e18a7bca7c48a90c4af70bd2f9bc303c0e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 24 Nov 2025 11:42:48 +0800 Subject: [PATCH 56/57] refine a little --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 452b5a21c..62c2f6a32 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -716,7 +716,7 @@ def _check_compatibility(self) -> None: "`iters=0` is recommended when exporting to current GGUF format" " or add `enable_alg_ext` for better accuracy with much more tuning cost." " Please refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" - " to check the acc." + " for the accuracy results." ) if ( From 94085dc7b702403b9b25b455561a58e1044d3cfb Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 24 Nov 2025 13:27:36 +0800 Subject: [PATCH 57/57] refine a little --- auto_round/utils/device.py | 4 ++++ docs/step_by_step.md | 1 + 2 files changed, 5 insertions(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 1db1fce69..12c904b3e 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1383,8 +1383,12 @@ def update(self, device_list=None): continue if torch.cuda.is_available(): current_vram = torch.cuda.memory_reserved(device) / 1024**3 # GB + if device == "cuda": + device = "0" elif torch.xpu.is_available(): current_vram = torch.xpu.memory_reserved(device) / 1024**3 # GB + if device == "xpu": + device = "0" else: return diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 29924bc79..e8c1dca8d 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -559,6 +559,7 @@ autoround.save_quantized(format="auto_awq", output_dir="tmp_autoround") - **Reduced CPU Memory Usage :** + - Enable low_cpu_mem_usage (experimental): Only one export format is supported. The quantized model is saved immediately after each block is packed, reducing peak CPU memory usage. - Trigger immediate packing: Packing will be triggered immediately when using the command-line interface or the quantize_and_save API, as long as only one export format is specified.
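
The packing changes in this series converge on one control-flow pattern: flatten the weight into GGML blocks, split the block dimension into fixed-size chunks, run the packing kernel for each chunk on the accelerator, and retry a failing chunk on the host after releasing cached device memory. A minimal, self-contained sketch of that pattern follows; `pack_fn` is a hypothetical stand-in for a GGML block writer and the chunk count is arbitrary, so treat this as an illustration of the flow rather than the auto_round implementation itself.

import torch

def pack_in_chunks(blocks: torch.Tensor, pack_fn, split_num: int = 16, device: str = "cuda"):
    # Split along the block dimension so only one chunk is resident in VRAM at a time.
    n_blocks = blocks.shape[0]
    chunk_size = (n_blocks + split_num - 1) // split_num
    outputs = []
    for i in range(split_num):
        chunk = blocks[i * chunk_size:(i + 1) * chunk_size]
        if chunk.numel() == 0:
            continue
        try:
            outputs.append(pack_fn(chunk.to(device)))
        except torch.OutOfMemoryError:
            # Release cached allocations, then retry this chunk on the host.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            outputs.append(pack_fn(chunk.to("cpu")))
    # Concatenate on CPU so the packed result never has to live on the device.
    return torch.cat([out.to("cpu") for out in outputs], dim=0)

With sixteen chunks, only a sixteenth of a large embedding or lm_head tensor sits on the accelerator at any point, and the per-chunk CPU fallback keeps a single oversized layer from aborting the export; that is where the VRAM and host-memory savings described in the docs change above come from.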