From 8ceb6a1d59209f62a96308cafcdf9e53568c5855 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 12:03:51 +0800 Subject: [PATCH 01/57] reduce vram --- auto_round/compressors/base.py | 14 +++++++++++--- auto_round/data_type/gguf.py | 12 ++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 7e9288f8d..b6ea8d40a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1050,7 +1050,7 @@ def _get_save_folder_name(self, format_str: str) -> str: return self.orig_output_dir - @torch.inference_mode() + # @torch.inference_mode() def _quantize_embedding_layer(self): """Quantizes embedding layers in the model according to the configuration. @@ -1122,9 +1122,12 @@ def _quantize_embedding_layer(self): # Update config self.layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(self.device_list) + - # Release memory - clear_memory(device_list=self.device_list) return is_quantized @@ -1354,10 +1357,14 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: has_gguf_k = ( any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) + if has_gguf_k: + self.model.to(torch.float32) self._quantize_embedding_layer() self.model.to("cpu") + # Release memory + clear_memory(device_list=self.device_list) enable_imatrix = False if not self.disable_opt_rtn: @@ -1628,6 +1635,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) is_quantized_embedding = self._quantize_embedding_layer() + clear_memory(device_list=self.device_list) all_q_inputs = None if is_quantized_embedding: all_inputs = copy.deepcopy(self.inputs) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 577ccf34e..1bf920a9c 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -408,6 +408,8 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri d_wmin = d_wmin.unsqueeze(-1) scale = (d_scale * q_scale).view(-1, 1) wmin = (d_wmin * q_wmin).view(-1, 1) + if split_num > 1: + clear_memory([tensor.device]) return scale, wmin, d_scale, d_wmin @@ -455,10 +457,12 @@ def quant_tensor_gguf_asym_dq( ) inverse_scale = get_reciprocal(scale) - int_w = torch.clamp(round_ste((tensor + wmin) * inverse_scale + v), 0, maxq) - qdq_result = (scale * int_w - wmin).to(orig_dtype) - qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) - return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} + tensor = tensor.add_(wmin) + tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0,maxq) + tensor = tensor.mul_(scale) + tensor = tensor.subtract_(wmin).to(orig_dtype) + tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) + return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): From 97f460e4a57db5c70bedc17ef6ff11625794e87f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 04:05:11 +0000 Subject: [PATCH 02/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --- auto_round/compressors/base.py | 2 -- auto_round/data_type/gguf.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index b6ea8d40a..544aea07c 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1127,8 +1127,6 @@ def _quantize_embedding_layer(self): del zp clear_memory(self.device_list) - - return is_quantized def _quant_rtn_with_imatrix(self, all_to_quantized_module_names: list[str]) -> None: diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 1bf920a9c..21c5a1e0d 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -458,7 +458,7 @@ def quant_tensor_gguf_asym_dq( inverse_scale = get_reciprocal(scale) tensor = tensor.add_(wmin) - tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0,maxq) + tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0, maxq) tensor = tensor.mul_(scale) tensor = tensor.subtract_(wmin).to(orig_dtype) tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) From dd27f91a0ef512cc9c80322f7b89c7af3795b54e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 14:46:48 +0800 Subject: [PATCH 03/57] update --- auto_round/compressors/base.py | 39 +++++---- auto_round/data_type/gguf.py | 73 ++++++++++++---- auto_round/export/export_to_gguf/packing.py | 97 ++++++++++++++++++++- auto_round/utils/common.py | 29 ++++-- auto_round/wrapper.py | 4 +- 5 files changed, 196 insertions(+), 46 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 544aea07c..3cd49771b 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1050,7 +1050,7 @@ def _get_save_folder_name(self, format_str: str) -> str: return self.orig_output_dir - # @torch.inference_mode() + @torch.inference_mode() def _quantize_embedding_layer(self): """Quantizes embedding layers in the model according to the configuration. @@ -1085,11 +1085,15 @@ def _quantize_embedding_layer(self): dtype = f"rtn_{dtype}" quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, to avoid cache a bf16 copy we'd better use float32 + if config["super_group_size"] is not None: + dtype = torch.float32 # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(self.device), + module.weight.to(dtype).to(self.device), # **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: @@ -1223,7 +1227,7 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str) -> None: + def _quantize_layer_via_rtn(self, name: str, dtype:torch.dtype=None) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. This function attempts to quantize a layer by switching its data type to a @@ -1240,13 +1244,14 @@ def _quantize_layer_via_rtn(self, name: str) -> None: RuntimeError: If quantization fails for reasons unrelated to memory. 
""" m = get_module(self.model, name) + if dtype is not None: + m = m.to(dtype) if is_fp8_linear(m): m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) set_module(self.model, name, m) # Step 1: Try quantization on GPU first, fall back to CPU if OOM - # if only export gguf, using gguf-packing instead of rtn if self.immediate_packing and self.iters == 0 and "gguf" in self.formats[0] and not self.disable_opt_rtn: m.scale = None m.zp = None @@ -1355,8 +1360,6 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: has_gguf_k = ( any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - if has_gguf_k: - self.model.to(torch.float32) self._quantize_embedding_layer() @@ -1471,6 +1474,12 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) input_others[key] = val.to(tmp_dtype) elif isinstance(val, list): input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + # for name in ["lm_head"]: + # dtype = None + # if self.super_group_size is not None: + # dtype = torch.float32 + # self._quantize_layer_via_rtn(name, dtype=dtype) + # clear_memory(device_list=self.device_list) for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") @@ -1501,6 +1510,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) self.device, self.cache_device, ) + if len(self.device_list) > 1: accelerate.hooks.remove_hook_from_submodules(block) @@ -1508,6 +1518,8 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) # enable moe experts act_max automatic generation for Linear set_amax_for_all_moe_layers(block, attr_name="act_max") # Normalize imatrix and quantize layers + if self.low_gpu_mem_usage: + clear_memory(device_list=self.device_list) for _, m in block.named_modules(): # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 @@ -1521,18 +1533,13 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) pbar.update(1) pbar.close() - cnt = 1 - block_names_cnt = len(flatten_list(get_block_names(self.model, True))) - clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt - if clear_mem_freq == 0: - clear_mem_freq = 1 # Process remaining layers not in blocks for name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(name) - if cnt % clear_mem_freq == 0: - clear_memory(device_list=self.device_list) - cnt = 1 - cnt += 1 + dtype=None + if self.super_group_size is not None: + dtype=torch.float32 + self._quantize_layer_via_rtn(name, dtype=dtype) + clear_memory(device_list=self.device_list) def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: keys = inputs.keys() diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 21c5a1e0d..056f74937 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -18,7 +18,7 @@ from auto_round.data_type.register import register_dtype from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES -from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants +from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants, make_qx_quants_chunk from auto_round.logger import logger from auto_round.utils import get_reciprocal from 
auto_round.utils.device import clear_memory @@ -165,6 +165,37 @@ def double_quant_tensor_sym(tensor, bits): return qdq_tensor, scale +def double_quant_tensor_sym_rtn(tensor, bits): + """ + Inplace-optimized symmetric double quantization. + - Uses float32 inplace where possible + - Minimizes temporary tensor allocations + """ + # Ensure tensor is float32 inplace (if tensor already float32, no copy) + if tensor.dtype != torch.float32: + tensor = tensor.float() # .float() creates a copy if needed + + maxq = 2 ** (bits - 1) + + # Compute absolute max along last dim + # abs_() is inplace + tensor_abs = tensor.abs() # cannot inplace abs on original if we need original sign + imax = tensor_abs.argmax(dim=-1, keepdim=True) + wmax = torch.take_along_dim(tensor, imax, dim=-1) + + # Compute scale inplace + scale = wmax / -maxq + inverse_scale = get_reciprocal(scale) + + # Inplace quantization + qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace + qdq_tensor = torch.round(qdq_tensor) # round inplace + qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace + qdq_tensor.mul_(scale) # multiply scale inplace + + return qdq_tensor, scale + + def make_qp_quants(nmax, data, quant_weights): data = data.to(torch.float32) quant_weights = quant_weights.to(torch.float32) @@ -324,7 +355,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None, split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 - group_size = 16 if bits == 2 else 32 + if bits not in [2, 4, 5]: raise ValueError(f"bits={bits} not supported by rtn_int_asym_dq") quant_weights = None @@ -409,7 +440,7 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri scale = (d_scale * q_scale).view(-1, 1) wmin = (d_wmin * q_wmin).view(-1, 1) if split_num > 1: - clear_memory([tensor.device]) + clear_memory(device_list=[tensor.device]) return scale, wmin, d_scale, d_wmin @@ -458,9 +489,9 @@ def quant_tensor_gguf_asym_dq( inverse_scale = get_reciprocal(scale) tensor = tensor.add_(wmin) - tensor = torch.round(tensor.mul_(inverse_scale)).clamp_(0, maxq) + tensor = (tensor.mul_(inverse_scale)).round_().clamp_(0, maxq) tensor = tensor.mul_(scale) - tensor = tensor.subtract_(wmin).to(orig_dtype) + tensor = tensor.sub_(wmin).to(orig_dtype) tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} @@ -640,17 +671,13 @@ def iterative_wls_quant_search( @torch.no_grad() -def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): - from auto_round.export.export_to_gguf.config import K_SCALE_SIZE, QK_K - - group_size = 16 - +def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: - scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None,split_num=split_num) else: imatrix = imatrix.to(tensor.device) weights = imatrix.reshape(1, -1) @@ -659,7 +686,7 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) - scale, int_w = make_qx_quants(tensor, 
bits=bits, rmse_type=1, qw=quant_weights) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights,split_num=split_num) return scale @@ -697,6 +724,12 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 + split_num=1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break + tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) orig_dtype = tensor.dtype super_bits = 6 if bits == 3 else 8 @@ -708,18 +741,20 @@ def quant_tensor_gguf_sym_dq( # (nb, 16, 16) tensor = tensor.reshape(n_blocks, super_group_size, QK_K // super_group_size) if scale is None and d_scale is None: - scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype) + scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num=split_num) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) # conduct double quant - scale, d_scale = double_quant_tensor_sym(scale, super_bits) + scale, d_scale = double_quant_tensor_sym_rtn(scale, super_bits) scale = scale.unsqueeze(-1) - zp = torch.full_like(scale, maxq) # pylint: disable=E1130 + # zp = torch.full_like(scale, maxq) # pylint: disable=E1130 inverse_scale = get_reciprocal(scale) - int_w = round_ste(tensor * inverse_scale).clip(-maxq, maxq - 1) + maxq - qdq_result = (scale * (int_w - zp)).to(orig_dtype) - qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) + # int_w = round_ste(tensor * inverse_scale).clip(-maxq, maxq - 1) + maxq + # qdq_result = (scale * (int_w - zp)).to(orig_dtype) + tensor = tensor.mul_(inverse_scale).round_().clamp_(-maxq, maxq - 1) + tensor = tensor.mul_(scale).to(orig_dtype) + tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len) - return qdq_result, {"scale": scale, "d_scale": d_scale}, zp + return tensor, {"scale": scale, "d_scale": d_scale}, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index bc9189b7b..e5e03b7f5 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -85,6 +85,99 @@ def torch_roundf(n): return torch.sign(n) * b +def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): + """ + Extreme VRAM-optimized version of quantization. + + - Processes data in chunks along the batch dimension (dim=0) to reduce peak memory usage. + - Uses inplace operations to avoid unnecessary tensor copies. + - Reuses buffers for temporary calculations wherever possible. 
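+    For example, with split_num=4 and data of shape (4096, 16, 16), each pass quantizes a (1024, 16, 16) slice, so temporary buffers scale with the chunk rather than with the full tensor.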
+ """ + nmax = 2 ** (bits - 1) + scales_list = [] + L_list = [] + chunk_size = (data.shape[0]+split_num-1)//split_num + for start in range(0, data.shape[0], chunk_size): + end = min(start + chunk_size, data.shape[0]) + chunk = data[start:end] # Slice a batch chunk to reduce memory footprint + + # Compute absolute values inplace to avoid extra tensor allocation + chunk_abs = chunk.abs() + imax = chunk_abs.argmax(dim=-1, keepdim=True) + group_max = torch.take_along_dim(chunk, imax, dim=-1) + + # Compute scale factors (inverse max) without extra tensor + + iscales = -nmax *get_reciprocal(group_max) + + # L buffer stores quantized values, modified inplace to save memory + L = (chunk * iscales).round_().clamp_(-nmax, nmax - 1) + + # Simple case: rmse_type == 0 + if rmse_type == 0: + L.add_(nmax) # Shift to unsigned representation inplace + scales = (1 / iscales).reshape(iscales.shape[:2]) + scales_list.append(scales) + L_list.append(L.to(torch.uint8)) + continue + + return_early = False + if rmse_type < 0: + rmse_type = -rmse_type + return_early = True + + # Compute weighting tensor w based on rmse_type + if qw is not None: + w = qw + elif rmse_type == 1: + w = chunk * chunk + elif rmse_type == 2: + w = torch.ones_like(chunk) + elif rmse_type == 3: + w = chunk.abs() + else: + w = chunk.abs().sqrt() + + # Compute sumlx and suml2 using the pre-allocated L buffer + sumlx = (w * chunk * L).sum(dim=-1) + suml2 = (w * L * L).sum(dim=-1) + scales = sumlx / suml2 + + if return_early: + iscales_inv = (1 / iscales).reshape(iscales.shape[:2]) + # Mix the current scale with inverse scale if suml2 > 0 + scales = torch.where(suml2 > 0, 0.5 * (scales + iscales_inv), iscales_inv) + L.add_(nmax) + scales_list.append(scales) + L_list.append(L.to(torch.uint8)) + continue + + # Iteratively refine scales and quantized values + best = scales * sumlx + for _is in range(-9, 10): + if _is == 0: + continue + iscales_tmp = -(nmax + -0.1 * _is) / group_max + # Use a temporary L buffer to avoid creating new large tensor + L_tmp = (chunk * iscales_tmp).round_().clamp_(-nmax, nmax - 1) + sumlx_tmp = (w * chunk * L_tmp).sum(dim=-1) + suml2_tmp = (w * L_tmp * L_tmp).sum(dim=-1) + # Determine which elements should be replaced + replace_id = (suml2_tmp > 0) & (sumlx_tmp * sumlx_tmp > best * suml2_tmp) + # Inplace update of L and scales + L[replace_id] = L_tmp[replace_id] + scales[replace_id] = sumlx_tmp[replace_id] / suml2_tmp[replace_id] + best[replace_id] = scales[replace_id] * sumlx_tmp[replace_id] + + L.add_(nmax) # Final shift to unsigned + scales_list.append(scales) + L_list.append(L.to(torch.uint8)) + + # Concatenate all chunks along batch dimension + scales = torch.cat(scales_list, dim=0) + L = torch.cat(L_list, dim=0) + return scales, L + def make_qx_quants(data, bits, rmse_type=0, qw=None): """ adapted from llmacpp @@ -248,10 +341,6 @@ def make_qkx2_quants(data, bits, weights=None, rmin=-1.0, rdelta=0.1, nstep=20, return scale.reshape(scale.shape[:2]), L, the_mins.reshape(the_mins.shape[:2]) -def make_qkx3_quants(data, bits, weights, rmin=-1.0, rdelta=0.1, nstep=20, use_mad=False): - return make_qkx2_quants(data, bits, weights, rmin=rmin, rdelta=rdelta, nstep=nstep, use_mad=use_mad) - - def make_qp_quants(nmax, data, quant_weights): group_max = torch.max(data, dim=-1, keepdim=True)[0] scale = group_max / nmax diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index 6b1717e7b..f16bda005 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -307,12 +307,31 @@ def 
json_serialize(obj: Any): raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + + + def get_reciprocal(tensor): - if torch.dtype is torch.float16: - tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5) - else: - tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor) - return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor)) + """ + Memory-frugal reciprocal: + - Inplace operations on original tensor + - Only allocates small boolean mask + """ + eps = 1e-5 if tensor.dtype == torch.float16 else 1e-30 + + # Create mask for very small elements (small overhead) + mask = tensor.abs() < eps + + # Prepare output in place: reuse tensor if allowed, otherwise create once + recip = torch.empty_like(tensor) + + # Safe reciprocal: for nonzero elements + nonzero_mask = ~mask + recip[nonzero_mask] = 1.0 / tensor[nonzero_mask] + + # Zero out elements below threshold + recip[mask] = 0.0 + + return recip def normalize_input(decoding_layer_inputs: list[tuple[Any]]) -> Tuple[List[torch.Tensor], Dict[str, Any]]: diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index 4d2c7d5fd..4b599d3a5 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -319,7 +319,7 @@ def unwrapper(self, best_params): if self.orig_layer.weight.device.type == "meta": self.orig_layer.to(self.device) - ##unwrapper weight + # Unwrapper weight qdq_weight, scale, zp = self._qdq_weight(v, min_scale, max_scale) # if hasattr(self.orig_layer, "imatrix"): # self.orig_layer.imatrix = None @@ -380,7 +380,7 @@ def _set_dict_attr(attr_dict, attr_name): self.orig_layer.update() self.orig_layer.to("meta") - ##unwrapper act + # Unwrapper act if self.enable_act_quant: if not self.orig_layer.act_dynamic: act_max_scale = best_params.get("act_max_scale", torch.tensor(1.0)).to(self.device) From 0ea4fa230bf72e79805bc9636b6b0727f899cc18 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 06:47:43 +0000 Subject: [PATCH 04/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 8 ++++---- auto_round/data_type/gguf.py | 18 +++++++++--------- auto_round/export/export_to_gguf/packing.py | 5 +++-- auto_round/utils/common.py | 3 --- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 3cd49771b..907da4525 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1093,7 +1093,7 @@ def _quantize_embedding_layer(self): # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(dtype).to(self.device), # + module.weight.to(dtype).to(self.device), # **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: @@ -1227,7 +1227,7 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str, dtype:torch.dtype=None) -> None: + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. 
This function attempts to quantize a layer by switching its data type to a @@ -1535,9 +1535,9 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) pbar.close() # Process remaining layers not in blocks for name in all_to_quantized_module_names: - dtype=None + dtype = None if self.super_group_size is not None: - dtype=torch.float32 + dtype = torch.float32 self._quantize_layer_via_rtn(name, dtype=dtype) clear_memory(device_list=self.device_list) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 056f74937..8ffe4ee58 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -188,10 +188,10 @@ def double_quant_tensor_sym_rtn(tensor, bits): inverse_scale = get_reciprocal(scale) # Inplace quantization - qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace - qdq_tensor = torch.round(qdq_tensor) # round inplace - qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace - qdq_tensor.mul_(scale) # multiply scale inplace + qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace + qdq_tensor = torch.round(qdq_tensor) # round inplace + qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace + qdq_tensor.mul_(scale) # multiply scale inplace return qdq_tensor, scale @@ -671,13 +671,13 @@ def iterative_wls_quant_search( @torch.no_grad() -def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num): +def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: - scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None,split_num=split_num) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) else: imatrix = imatrix.to(tensor.device) weights = imatrix.reshape(1, -1) @@ -686,7 +686,7 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) - scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights,split_num=split_num) + scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights, split_num=split_num) return scale @@ -724,7 +724,7 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 - split_num=1 + split_num = 1 for dim in tensor.shape: if dim > 100_000: split_num = 16 @@ -741,7 +741,7 @@ def quant_tensor_gguf_sym_dq( # (nb, 16, 16) tensor = tensor.reshape(n_blocks, super_group_size, QK_K // super_group_size) if scale is None and d_scale is None: - scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype,split_num=split_num) + scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num=split_num) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index e5e03b7f5..81b549a4c 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -96,7 +96,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): nmax = 2 ** (bits - 1) scales_list = [] L_list = [] - chunk_size = (data.shape[0]+split_num-1)//split_num + chunk_size = (data.shape[0] + split_num - 1) // split_num 
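    # Ceil division: at most split_num chunks of up to chunk_size rows each; the last chunk may be smaller.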
for start in range(0, data.shape[0], chunk_size): end = min(start + chunk_size, data.shape[0]) chunk = data[start:end] # Slice a batch chunk to reduce memory footprint @@ -108,7 +108,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): # Compute scale factors (inverse max) without extra tensor - iscales = -nmax *get_reciprocal(group_max) + iscales = -nmax * get_reciprocal(group_max) # L buffer stores quantized values, modified inplace to save memory L = (chunk * iscales).round_().clamp_(-nmax, nmax - 1) @@ -178,6 +178,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): L = torch.cat(L_list, dim=0) return scales, L + def make_qx_quants(data, bits, rmse_type=0, qw=None): """ adapted from llmacpp diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index f16bda005..3241f0cb1 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -307,9 +307,6 @@ def json_serialize(obj: Any): raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") - - - def get_reciprocal(tensor): """ Memory-frugal reciprocal: From d40de66e76a4293bc18e7d310f56301498502dac Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 16:52:47 +0800 Subject: [PATCH 05/57] update --- README.md | 7 ++-- auto_round/compressors/base.py | 3 +- auto_round/export/export_to_gguf/convert.py | 4 ++- auto_round/export/export_to_gguf/export.py | 3 +- auto_round/export/export_to_gguf/packing.py | 37 +++++++++++++-------- 5 files changed, 33 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index d2ec512a2..367a08c7c 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") - **`layer_config` (dict)**: Configuration for weight quantization (default is `None`), mainly for mixed schemes. ##### Algorithm Settings -- **`enable_alg_ext` (bool)**: [Experimental Feature] Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. +- **`enable_alg_ext` (bool)**: [Experimental Feature] Only effective when `iters>0`. Enables algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. - **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled). ##### Tuning Process Parameters @@ -212,7 +212,8 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") -### AutoScheme Usage +### Adaptive Bits/Dtype Usage +AutoScheme provides an algorithm that automatically generates mixed bits/data_type quantization recipes. For accuracy results, please refer to this [doc](https://github.com/intel/auto-round/blob/main/docs/auto_scheme_acc.md). See the [user guide](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#autoscheme) for more details on AutoScheme. ~~~python from auto_round import AutoRound, AutoScheme @@ -294,7 +295,7 @@ for output in outputs: ### SGLang (Intel GPU/CUDA) -Please note that support for the MoE models and visual language models is currently limited.
+**Please note that support for the MoE models and visual language models is currently limited.** ```python import sglang as sgl diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 907da4525..fc06158a4 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1530,6 +1530,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) + clear_memory(device_list=self.device_list) pbar.update(1) pbar.close() @@ -1539,7 +1540,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) if self.super_group_size is not None: dtype = torch.float32 self._quantize_layer_via_rtn(name, dtype=dtype) - clear_memory(device_list=self.device_list) + # clear_memory(device_list=self.device_list) def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: keys = inputs.keys() diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 3ac31932d..64312d332 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, get_module, get_packing_device, is_fp8_model, logger +from auto_round.utils import LazyImport, get_module, get_packing_device, is_fp8_model, logger, clear_memory gguf = LazyImport("gguf") @@ -598,6 +598,8 @@ def prepare_tensors(cls): logger.info( f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype}" f" --> {data_qtype.name}, shape = {shape_str}" ) + if not (hasattr(cls, "current_packing_block") and cls.current_packing_block is not None): + clear_memory(device_list=[orig_device]) cls.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 890a93880..140776087 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -178,7 +178,7 @@ def pack_gguf_layer( last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} model.last_layer_name_to_block_name = last_layer_name_to_block_name if name in model.last_layer_name_to_block_name: - ##packing block + # Packing block for gguf_model in gguf_model_instance_global: gguf_model.current_packing_block = model.last_layer_name_to_block_name[name] gguf_model.prepare_tensors() @@ -189,7 +189,6 @@ def pack_gguf_layer( m.weight = None if hasattr(m, "bias"): m.bias = None - clear_memory() model.last_layer_name_to_block_name.pop(name) if len(model.last_layer_name_to_block_name) == 0: for gguf_model in gguf_model_instance_global: diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 81b549a4c..0bd76db04 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -528,9 +528,9 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i mins = wmin.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(0, 15).to(torch.uint8) - output_scale |= torch.round(mins * get_reciprocal(output_dmin)).clip(0, 
15).to(torch.uint8) << 4 - all_L = torch.round((blocks + mins.unsqueeze(-1)) / scales.unsqueeze(-1)).clip(0, 3).to(torch.uint8) + output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) + output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0,3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -556,10 +556,16 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = d_tmp != 0 all_L[replace_ids] = ( - torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - .clip(0, 3) + blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)).round_().div_(d_tmp[replace_ids].unsqueeze(-1)) + .clamp_(0, 3) .to(torch.uint8) ) + + # all_L[replace_ids] = ( + # torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) + # .clip(0, 3) + # .to(torch.uint8) + # ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -573,9 +579,9 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i mins = mins.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(0, 15).to(torch.uint8) - output_scale |= torch.round(mins * get_reciprocal(output_dmin)).clip(0, 15).to(torch.uint8) << 4 - all_L = torch.round((blocks + mins.unsqueeze(-1)) / scales.unsqueeze(-1)).clip(0, 3).to(torch.uint8) + output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) + output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 + all_L = blocks.add_(mins.unsqueeze(-1)).round_().div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) @@ -815,19 +821,22 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, if scale is not None: scales = scale.reshape(-1, QK_K // 16) output_d = d_scale.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(max=127).to(torch.int8) - all_L = torch.round(blocks * get_reciprocal(scales.unsqueeze(-1)) + 32).clip(0, 63).to(torch.uint8) + rd = get_reciprocal(output_d) + output_scale = scales.mul(rd).round_().clamp_(max=127).to(torch.int8) + rs = get_reciprocal(scales).unsqueeze_(-1) # inplace unsqueeze + all_L = blocks.mul(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) elif original: scales, all_L = make_qx_quants(blocks, bits=6, rmse_type=1, qw=None) imax = abs(scales).argmax(dim=-1, keepdim=True) max_scales = torch.take_along_dim(scales, imax, dim=-1) + iscales = -128 * get_reciprocal(max_scales) output_d = get_reciprocal(iscales) - output_scale = torch.round(iscales * scales).clip(max=127).to(torch.int8) + output_scale = (iscales * scales).round_().clamp_(max=127).to(torch.int8) d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 all_L[replace_ids] = ( - torch.round(blocks[replace_ids] / d_tmp[replace_ids].reshape(-1, 1) + 32).clip(0, 63).to(torch.uint8) + (blocks[replace_ids] / d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 
63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -838,8 +847,8 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) - output_scale = torch.round(scales * get_reciprocal(output_d)).clip(max=127).to(torch.int8) - all_L = torch.round(blocks * get_reciprocal(scales.unsqueeze(-1)) + 32).clip(0, 63).to(torch.uint8) + output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(max=127).to(torch.int8) + all_L = blocks.mul_(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) tmp_L = all_L.reshape(nb, 4, 64) & 0xF output_ql = (tmp_L[:, ::2] | (tmp_L[:, 1::2] << 4)).reshape(nb, QK_K // 2).cpu().numpy().astype(np.uint8) From 67a1b3433f4a0481de08f4d93e01adfce4f03dec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 09:00:36 +0000 Subject: [PATCH 06/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/convert.py | 2 +- auto_round/export/export_to_gguf/packing.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 64312d332..c075bbe7c 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, get_module, get_packing_device, is_fp8_model, logger, clear_memory +from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger gguf = LazyImport("gguf") diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 0bd76db04..fd291e815 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -530,7 +530,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0,3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -556,7 +556,10 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)).round_().div_(d_tmp[replace_ids].unsqueeze(-1)) + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .round_() + .div_(d_tmp[replace_ids].unsqueeze(-1)) .clamp_(0, 3) .to(torch.uint8) ) From 2468f0a42ea504792aec44f1dfa0d71666330248 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 18:23:33 +0800 Subject: [PATCH 07/57] fix bug --- auto_round/utils/device.py | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 2f63a3a2d..cb27a9be8 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -416,14 +416,15 @@ def _clear_memory_for_cpu_and_cuda( del tensor gc.collect() if torch.cuda.is_available(): - if device_list is None: + if not device_list: torch.cuda.synchronize() # Fix https://github.com/intel/auto-round/issues/1004 torch.cuda.empty_cache() - elif len(device_list) > 1: + elif len(device_list) >= 1: devices = [] for device in device_list: + device = str(device) if not device.startswith("cuda"): continue if ":" in device: @@ -440,6 +441,7 @@ def _clear_memory_for_cpu_and_cuda( @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): + logger.info("call") from auto_round.utils.device import is_hpex_available if is_hpex_available(): From 0bd2cf92a2937233c39f57402dcfa8af5440fe16 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 10:24:27 +0000 Subject: [PATCH 08/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index cb27a9be8..999b98329 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -416,7 +416,7 @@ def _clear_memory_for_cpu_and_cuda( del tensor gc.collect() if torch.cuda.is_available(): - if not device_list: + if not device_list: torch.cuda.synchronize() # Fix https://github.com/intel/auto-round/issues/1004 torch.cuda.empty_cache() From 77014877a9369f213db3a3ad938c1efee9a745ea Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 19:42:18 +0800 Subject: [PATCH 09/57] update --- auto_round/compressors/base.py | 11 +++-- auto_round/data_type/gguf.py | 39 +++++++++-------- auto_round/data_type/int.py | 7 +-- auto_round/export/export_to_gguf/packing.py | 47 +++++++++++++++------ auto_round/utils/device.py | 2 +- 5 files changed, 64 insertions(+), 42 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index fc06158a4..7d39f2696 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1129,7 +1129,7 @@ def _quantize_embedding_layer(self): del weight del scale del zp - clear_memory(self.device_list) + clear_memory(device_list = self.device_list) return is_quantized @@ -1530,7 +1530,10 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) - clear_memory(device_list=self.device_list) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.device_list) + else: + clear_memory(device_list=self.device_list) pbar.update(1) pbar.close() @@ -2846,7 +2849,7 @@ def _quantize_block( if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids) + clear_memory(input_ids,device_list=self.device_list) return q_outputs, output else: @@ -2854,7 +2857,7 @@ def _quantize_block( accelerate.hooks.remove_hook_from_submodules(block) if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids) + clear_memory(input_ids,device_list=self.device_list) return None, output diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 8ffe4ee58..bc6669837 
100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -188,12 +188,12 @@ def double_quant_tensor_sym_rtn(tensor, bits): inverse_scale = get_reciprocal(scale) # Inplace quantization - qdq_tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace - qdq_tensor = torch.round(qdq_tensor) # round inplace - qdq_tensor.clamp_(-maxq, maxq - 1) # clamp inplace - qdq_tensor.mul_(scale) # multiply scale inplace + tensor = tensor.mul_(inverse_scale) # tensor * inverse_scale inplace + tensor = tensor.round_() # round inplace + tensor.clamp_(-maxq, maxq - 1) # clamp inplace + tensor.mul_(scale) # multiply scale inplace - return qdq_tensor, scale + return tensor, scale def make_qp_quants(nmax, data, quant_weights): @@ -448,13 +448,13 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri def quant_tensor_gguf_asym_dq( tensor: torch.Tensor, bits: int = 4, - v=0, scale_dtype=torch.float16, imatrix=None, scale=None, wmin=None, d_scale=None, d_wmin=None, + split_num=None, **kwargs, ): """Quantizes and dequantizes a tensor using asymmetric integer quantization for formats like Q2_K, Q4_K, and Q5_K. @@ -473,11 +473,12 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break + if split_num is None: + split_num = 1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) @@ -674,7 +675,7 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) + scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) #TODO split num ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) @@ -687,6 +688,8 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights, split_num=split_num) + if split_num>1: + clear_memory(device_list=[tensor.device]) return scale @@ -698,6 +701,7 @@ def quant_tensor_gguf_sym_dq( scale=None, d_scale=None, scale_dtype=torch.float16, + split_num=None, **kwargs, ): """Quantize and de-quantize tensor asymmetrically. For Q3_K, Q6_K. 
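The gguf.py hunks in this patch replace out-of-place round/clip pipelines with in-place tensor ops. A minimal sketch of that pattern, using plain per-row symmetric quantization rather than the repository's grouped GGUF variants (the helper name `qdq_sym_inplace` is illustrative, not part of auto_round):
~~~python
import torch


def qdq_sym_inplace(tensor: torch.Tensor, bits: int = 6):
    """Quantize-dequantize `tensor` in place with one scale per row (illustrative helper).

    Chaining mul_/round_/clamp_/mul_ reuses the weight buffer instead of
    allocating separate int_w and qdq_result tensors, which is where the
    VRAM saving in this patch series comes from.
    """
    maxq = 2 ** (bits - 1)
    wmax = tensor.abs().amax(dim=-1, keepdim=True)  # row-wise max magnitude
    scale = wmax / maxq
    inv_scale = torch.where(scale == 0, torch.zeros_like(scale), 1.0 / scale)
    tensor.mul_(inv_scale).round_().clamp_(-maxq, maxq - 1).mul_(scale)
    return tensor, scale


w = torch.randn(8, 256)        # toy weight block
w_qdq, s = qdq_sym_inplace(w)  # w itself now holds the dequantized values
~~~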
@@ -724,11 +728,12 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break + if split_num is None: + split_num = 1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) orig_dtype = tensor.dtype diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 8fc6f79a0..c32646da7 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -53,11 +53,6 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 bits: Number of bits for quantization (e.g., 2, 3, 4, 8) group_size: Number of elements to share scale for quantization v: Rounding value perturbation - min_scale: Minimum scale coefficient for tensor - max_scale: Maximum scale coefficient for tensor - tensor_min (Tensor, optional): Minimum tensor value for quantization. Defaults to None. - tensor_max (Tensor, optional): Maximum tensor value for quantization. Defaults to None. - scale_dtype: dtype of the quantized scale,as most kernels only support FP16 or FP32, while this value is import q_scale_thresh: clip the quantized scale's magnitude to this value to improve the numerical stability Returns: @@ -79,7 +74,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w = round_ste(tensor / scale + v) + int_w = torch.round(tensor / scale) q = torch.clamp(int_w, -maxq, maxq - 1) qdq_result = (scale * q).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index fd291e815..b83039dd3 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -52,11 +52,17 @@ def ggml_quant( shape = data.shape n_blocks = data.nelement() // block_size + split_num = 1 + for dim in data.shape: + if dim > 100_000: + split_num = 16 + break + blocks = data.reshape((n_blocks, block_size)) quant_func = GGML_QUANT_TYPE[ggml_type] try: new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num ) except Exception: device = "cpu" @@ -69,7 +75,7 @@ def ggml_quant( imatrix = imatrix.to(device) if imatrix is not None else imatrix clear_memory() new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num ) assert new_data.shape[-1] == type_size @@ -518,9 +524,9 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") -def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs): +def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): nb = blocks.shape[0] - + device=blocks.device blocks = blocks.reshape((nb, QK_K // 16, 16)) # (nb, 16, 16) if scale is not None: @@ -573,9 +579,16 
@@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] + if split_num is not None and split_num>1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + mins = mins.to("cpu") + d_wmin = d_wmin.to("cpu") + clear_memory(device_list=[device]) blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) @@ -600,7 +613,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q3_k") -def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): +def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -626,7 +639,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) @@ -653,7 +666,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") -def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs): +def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -694,7 +707,7 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -733,7 +746,7 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q5_k") def q5_k_quant_block( - blocks, scale=None, zp=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs + blocks, scale=None, zp=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -775,7 +788,7 @@ def q5_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) + 
blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -817,10 +830,10 @@ def q5_k_quant_block( @register_qtype("q6_k") -def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): +def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None,split_num=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) - + device = blocks.device if scale is not None: scales = scale.reshape(-1, QK_K // 16) output_d = d_scale.reshape(-1, 1).to(torch.float32) @@ -845,8 +858,14 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) scales, d_scale = scales["scale"], scales["d_scale"] + if split_num is not None and split_num>1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + clear_memory(device_list=[device]) + blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index cb27a9be8..f0f007064 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -441,7 +441,7 @@ def _clear_memory_for_cpu_and_cuda( @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): - logger.info("call") + # logger.info("call") from auto_round.utils.device import is_hpex_available if is_hpex_available(): From 42e5cc0dd86362b82a968a988f5af271cdcca5a4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Nov 2025 11:45:13 +0000 Subject: [PATCH 10/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 6 +- auto_round/data_type/gguf.py | 4 +- auto_round/export/export_to_gguf/packing.py | 73 ++++++++++++++++----- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 7d39f2696..36aff9e40 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1129,7 +1129,7 @@ def _quantize_embedding_layer(self): del weight del scale del zp - clear_memory(device_list = self.device_list) + clear_memory(device_list=self.device_list) return is_quantized @@ -2849,7 +2849,7 @@ def _quantize_block( if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids,device_list=self.device_list) + clear_memory(input_ids, device_list=self.device_list) return q_outputs, output else: @@ -2857,7 +2857,7 @@ def _quantize_block( accelerate.hooks.remove_hook_from_submodules(block) if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids,device_list=self.device_list) + clear_memory(input_ids, device_list=self.device_list) return None, output diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index bc6669837..822c75f9b 100644 --- 
a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -675,7 +675,7 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) #TODO split num + scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) # TODO split num ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) @@ -688,7 +688,7 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): quant_weights = _imatrix_handle_zero(quant_weights, tensor, bits) scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=quant_weights, split_num=split_num) - if split_num>1: + if split_num > 1: clear_memory(device_list=[tensor.device]) return scale diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index b83039dd3..be62a1340 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -62,7 +62,15 @@ def ggml_quant( quant_func = GGML_QUANT_TYPE[ggml_type] try: new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + split_num=split_num, ) except Exception: device = "cpu" @@ -75,7 +83,15 @@ def ggml_quant( imatrix = imatrix.to(device) if imatrix is not None else imatrix clear_memory() new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original,split_num=split_num + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + split_num=split_num, ) assert new_data.shape[-1] == type_size @@ -524,9 +540,11 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") -def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): +def q2_k_quant_block( + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs +): nb = blocks.shape[0] - device=blocks.device + device = blocks.device blocks = blocks.reshape((nb, QK_K // 16, 16)) # (nb, 16, 16) if scale is not None: @@ -579,10 +597,12 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, mins = quant_tensor_gguf_asym_dq( + blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num>1: + if split_num is not None and split_num > 1: blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") @@ -613,7 +633,9 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q3_k") -def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, 
original=False, imatrix=None, split_num=None, **kwargs): +def q3_k_quant_block( + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs +): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -639,7 +661,9 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, _ = quant_tensor_gguf_sym_dq( + blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) @@ -666,7 +690,9 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") -def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False,split_num=None, **kwargs): +def q4_k_quant_block( + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs +): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -707,7 +733,9 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, mins = quant_tensor_gguf_asym_dq( + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -746,7 +774,16 @@ def q4_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i @register_qtype("q5_k") def q5_k_quant_block( - blocks, scale=None, zp=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs + blocks, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + original=False, + split_num=None, + **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -788,7 +825,9 @@ def q5_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, mins = quant_tensor_gguf_asym_dq( + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -830,7 +869,9 @@ def q5_k_quant_block( @register_qtype("q6_k") -def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None,split_num=None, **kwargs): +def q6_k_quant_block( + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs +): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) device = blocks.device @@ -858,9 +899,11 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = 
quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix,split_num=split_num) + blocks, scales, _ = quant_tensor_gguf_sym_dq( + blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + ) scales, d_scale = scales["scale"], scales["d_scale"] - if split_num is not None and split_num>1: + if split_num is not None and split_num > 1: blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") From 1307dddb23e14a530764f679591adaca4328fac9 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 19:49:05 +0800 Subject: [PATCH 11/57] git push --- auto_round/utils/device.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 782c5f76d..eec60ff45 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -436,6 +436,7 @@ def _clear_memory_for_cpu_and_cuda( torch.cuda.synchronize(device) torch.cuda.empty_cache() if torch.xpu.is_available(): + torch.xpu.synchronize() torch.xpu.empty_cache() From 6d6d86ab8e8df6587772c49001ce3a2462397c4e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 21:40:32 +0800 Subject: [PATCH 12/57] fix accuracy bug --- auto_round/export/export_to_gguf/packing.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index be62a1340..d3c5a38e4 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -482,7 +482,7 @@ def q5_0_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_0"][0] # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 16.5).clip(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 16.5).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -508,7 +508,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc((blocks - min) * id + 0.5).clip(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub_(min).mul_(id).add_ (0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -528,7 +528,6 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: else: d = torch.abs(blocks).max(dim=1, keepdim=True)[0] / 127 id = get_reciprocal(d) - qs = torch.clip(torch_roundf(blocks * id), -128, 127) # (n_blocks, 2) @@ -554,7 +553,7 @@ def q2_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -582,8 +581,7 @@ def q2_k_quant_block( all_L[replace_ids] = ( 
blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) - .round_() - .div_(d_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)).round_() .clamp_(0, 3) .to(torch.uint8) ) @@ -617,7 +615,7 @@ def q2_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).round_().div_(scales.unsqueeze(-1)).clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) From e2586f95c378bf65ff3189214b322ec45f7cdff8 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 21:47:55 +0800 Subject: [PATCH 13/57] trigger ut --- auto_round/export/export_to_gguf/packing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index d3c5a38e4..9e3d92845 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -535,6 +535,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: # (n_blocks, block_size) qs = qs.cpu().numpy().astype(np.int8).view(np.uint8) + return np.concatenate([d, qs], axis=1) From c7b3c241ad2f0644162dde5e7554336253ede357 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 18 Nov 2025 21:53:12 +0800 Subject: [PATCH 14/57] clean code --- auto_round/compressors/base.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 36aff9e40..48f082f3a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1086,14 +1086,15 @@ def _quantize_embedding_layer(self): quant_func = QUANT_FUNC_WITH_DTYPE[dtype] dtype = module.weight.dtype - # As typically float32 are used in RTN to search scale zp, to avoid cache a bf16 copy we'd better use float32 + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 if config["super_group_size"] is not None: dtype = torch.float32 # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(dtype).to(self.device), # + module.weight.to(dtype).to(self.device), **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: @@ -1474,12 +1475,6 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) input_others[key] = val.to(tmp_dtype) elif isinstance(val, list): input_others[key] = [to_dtype(v, tmp_dtype) for v in val] - # for name in ["lm_head"]: - # dtype = None - # if self.super_group_size is not None: - # dtype = torch.float32 - # self._quantize_layer_via_rtn(name, dtype=dtype) - # clear_memory(device_list=self.device_list) for block_name in block_names: pbar.set_description(f"Quantizing {block_name}") From 8ad2019e3019c03b316d0e0242c516bebb524d07 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 11:06:00 +0800 Subject: [PATCH 15/57] q80 q4k --- auto_round/export/export_to_gguf/packing.py | 60 ++++++++++----------- auto_round/utils/device.py | 56 +++++++++++++------ 2 files changed, 69 insertions(+), 47 deletions(-) diff --git 
a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9e3d92845..8fd78d4b2 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -72,7 +72,8 @@ def ggml_quant( original=original, split_num=split_num, ) - except Exception: + except torch.OutOfMemoryError: + orig_device = blocks.device device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -81,7 +82,7 @@ def ggml_quant( d_scale = d_scale.to(device) if d_scale is not None else d_scale d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory() + clear_memory(device_list=orig_device) new_data = quant_func( blocks, scale, @@ -103,7 +104,7 @@ def ggml_quant( def torch_roundf(n): a = torch.abs(n) floored = torch.floor(a) - b = floored + torch.floor(2 * (a - floored)) + b = floored + torch.floor((a - floored).mul_(2)) return torch.sign(n) * b @@ -528,14 +529,14 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: else: d = torch.abs(blocks).max(dim=1, keepdim=True)[0] / 127 id = get_reciprocal(d) - qs = torch.clip(torch_roundf(blocks * id), -128, 127) + blocks = blocks.mul_(id) + qs = torch_roundf(blocks).clamp_(-128, 127) # (n_blocks, 2) d = d.cpu().numpy().astype(np.float16).view(np.uint8) # (n_blocks, block_size) qs = qs.cpu().numpy().astype(np.int8).view(np.uint8) - return np.concatenate([d, qs], axis=1) @@ -587,11 +588,6 @@ def q2_k_quant_block( .to(torch.uint8) ) - # all_L[replace_ids] = ( - # torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - # .clip(0, 3) - # .to(torch.uint8) - # ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -695,20 +691,17 @@ def q4_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) - output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) if scale is not None: scales = scale.reshape(-1, QK_K // 32) mins = wmin.reshape(-1, QK_K // 32) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) - all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 15) - .to(torch.uint8) - ) + q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clam_(0, 63).to(torch.uint8) + all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). 
+ round_().clamp_(0,15).to(torch.uint8)) + elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=4, rmin=-1, rdelta=0.1, nstep=20, use_mad=False) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -717,17 +710,15 @@ def q4_k_quant_block( id_mins = (63 * get_reciprocal(max_mins)).clamp(min=0) output_d = max_scales / 63 output_dmin = max_mins / 63 - q_scales = torch.round(id_scales * scales).clip(0, 63).to(torch.uint8) - q_mins = torch.round(id_mins * mins).clip(0, 63).to(torch.uint8) + q_scales = (id_scales * scales).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (id_mins * mins).round_().clip(0, 63).to(torch.uint8) d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids] = ( - torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - .clip(0, 15) - .to(torch.uint8) - ) + all_L[replace_ids] = (blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). + div_(d_tmp[replace_ids].unsqueeze(-1)).clamp_(0,15).to(torch.uint8)) + else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -737,20 +728,28 @@ def q4_k_quant_block( ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] + if split_num > 1: + orig_device = blocks.device + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + mins = mins.to("cpu") + d_wmin = d_wmin.to("cpu") + blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) mins = mins.reshape((-1, QK_K // 32)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) + q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 15) + blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + .clamp(0, 15) .to(torch.uint8) ) - + output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) output_scale[:, :4] = q_scales[:, :4] output_scale[:, 4:8] = q_mins[:, :4] @@ -906,7 +905,6 @@ def q6_k_quant_block( blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") - clear_memory(device_list=[device]) blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index eec60ff45..fddb5c969 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -407,39 +407,63 @@ def bytes_to_gigabytes(bytes) -> int: def _clear_memory_for_cpu_and_cuda( - tensor: torch.Tensor | list[torch.Tensor] | None = None, device_list: tuple | list | None = None + tensor: torch.Tensor | list[torch.Tensor] | None = None, + device_list: tuple | list | str | torch.device | None = None ): + # ------------------------ + # Clear CPU-side references + # ------------------------ if isinstance(tensor, list): for i in range(len(tensor)): tensor[i] = None - if tensor is not None: - del tensor + tensor = None gc.collect() + + # ------------------------ + # Normalize device_list + # ------------------------ + if isinstance(device_list, (str, torch.device)): + 
device_list = [device_list] + + # ----------------------------------- + # CUDA-specific clearing + # ----------------------------------- if torch.cuda.is_available(): + # No device_list → clear all GPUs if not device_list: - torch.cuda.synchronize() # Fix https://github.com/intel/auto-round/issues/1004 + torch.cuda.synchronize() torch.cuda.empty_cache() - - elif len(device_list) >= 1: + else: + # Parse valid CUDA device IDs devices = [] - for device in device_list: - device = str(device) - if not device.startswith("cuda"): + for dev in device_list: + dev = str(dev) + if not dev.startswith("cuda"): continue - if ":" in device: - device = device.split(":")[-1] + # cuda / cuda:0 / cuda:1 + if ":" in dev: + devid = int(dev.split(":")[-1]) else: - device = 0 - devices.append(int(device)) - for device in devices: - torch.cuda.synchronize(device) + devid = 0 + devices.append(devid) + + for d in devices: + torch.cuda.synchronize(d) + torch.cuda.empty_cache() - if torch.xpu.is_available(): + + # ----------------------------------- + # XPU-specific clearing + # ----------------------------------- + if hasattr(torch, "xpu") and torch.xpu.is_available(): torch.xpu.synchronize() torch.xpu.empty_cache() + + + @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): # logger.info("call") From d3168544cb259785cb351f417e426ec88a80b7ef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 03:07:32 +0000 Subject: [PATCH 16/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py | 29 ++++++++++++++------- auto_round/utils/device.py | 5 +--- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 8fd78d4b2..d8c2578b9 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -509,7 +509,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc(blocks.sub_(min).mul_(id).add_ (0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -583,7 +583,8 @@ def q2_k_quant_block( all_L[replace_ids] = ( blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) - .div_(d_tmp[replace_ids].unsqueeze(-1)).round_() + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() .clamp_(0, 3) .to(torch.uint8) ) @@ -691,7 +692,6 @@ def q4_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) - if scale is not None: scales = scale.reshape(-1, QK_K // 32) mins = wmin.reshape(-1, QK_K // 32) @@ -699,8 +699,13 @@ def q4_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clam_(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). 
- round_().clamp_(0,15).to(torch.uint8)) + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() + .clamp_(0, 15) + .to(torch.uint8) + ) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=4, rmin=-1, rdelta=0.1, nstep=20, use_mad=False) @@ -716,8 +721,13 @@ def q4_k_quant_block( d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids] = (blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). - div_(d_tmp[replace_ids].unsqueeze(-1)).clamp_(0,15).to(torch.uint8)) + all_L[replace_ids] = ( + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .clamp_(0, 15) + .to(torch.uint8) + ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -736,7 +746,6 @@ def q4_k_quant_block( mins = mins.to("cpu") d_wmin = d_wmin.to("cpu") - blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) mins = mins.reshape((-1, QK_K // 32)) @@ -745,7 +754,9 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() .clamp(0, 15) .to(torch.uint8) ) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index fddb5c969..3ab9ae72a 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -408,7 +408,7 @@ def bytes_to_gigabytes(bytes) -> int: def _clear_memory_for_cpu_and_cuda( tensor: torch.Tensor | list[torch.Tensor] | None = None, - device_list: tuple | list | str | torch.device | None = None + device_list: tuple | list | str | torch.device | None = None, ): # ------------------------ # Clear CPU-side references @@ -461,9 +461,6 @@ def _clear_memory_for_cpu_and_cuda( torch.xpu.empty_cache() - - - @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): # logger.info("call") From 1743472bd92685c417cea544e3007bc0e3c48c3f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 12:15:40 +0800 Subject: [PATCH 17/57] q5k --- auto_round/data_type/gguf.py | 187 +++++++++----------- auto_round/export/export_to_gguf/packing.py | 56 +++--- 2 files changed, 106 insertions(+), 137 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 822c75f9b..4912e6e77 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -497,73 +497,6 @@ def quant_tensor_gguf_asym_dq( return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} -def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): - """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. - - Args: - data (torch.Tensor): Input tensor to quantize. - bits (int): Number of quantization bits. - rrmin (float): Initial range scaling factor. - rdelta (float): Step size for range scaling. - nstep (int): Number of search steps. - use_mad (bool): Whether to use mean absolute deviation instead of squared error. - weights (torch.Tensor): Weight matrix for each element. 
- - Returns: - Tuple: (Optimal scale tensor, optimal minimum value tensor) - """ - dtype = torch.float32 - data = data.to(dtype) - maxq = 2**bits - 1 - minq = 0 - weights = 1.0 if weights is None else weights.to(dtype) - - rmin = torch.min(data, dim=1, keepdim=True)[0] - rmax = torch.max(data, dim=1, keepdim=True)[0] - - sum_w = torch.sum(weights, dim=1, keepdim=True) - sum_x = torch.sum(weights * data, dim=1, keepdim=True) - - # scale = 1 / ((maxq - minq) / (rmax - rmin + 1e-8)) - scale = (rmax - rmin) / (maxq - minq) - iscale = get_reciprocal(scale) - # quant_data = torch.clamp(torch.round((maxq - minq) / (rmax - rmin + 1e-8) * (data - rmin)), minq, maxq) - quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq) - diff = scale * quant_data + rmin - data - - best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True) - - for is_ in range(nstep): - factor = rrmin + rdelta * is_ + maxq - minq - # iscale_new = factor / (rmax - rmin + 1e-8) - scale_new = (rmax - rmin) / factor - iscale_new = get_reciprocal(scale_new) - quant_data_new = torch.clamp(torch.round(iscale_new * (data - rmin)), minq, maxq) - - mul_weights_quant_data = weights * quant_data_new - sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) - sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) - sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True) - - D = sum_w * sum_l2 - torch.pow(sum_l, 2) - this_scale = (sum_w * sum_xl - sum_x * sum_l) / D - this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D - this_min[this_min > 0] = 0 - this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] - reverse_this_scale = get_reciprocal(this_scale) - - quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq) - diff = this_scale * quant_data + this_min - data - # diff = this_scale * quant_data_new + this_min - data - mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True) - - idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] - best_mad[idx_to_replace] = mad[idx_to_replace] - scale[idx_to_replace] = this_scale[idx_to_replace] - rmin[idx_to_replace] = this_min[idx_to_replace] - - return scale.to(torch.float32), -rmin.to(torch.float32) - # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( @@ -577,52 +510,99 @@ def iterative_wls_quant_search_chunk( results_scale = [] results_rmin = [] + chunk_size = (data.shape[0] + split_num - 1) // split_num + for start in range(0, data.shape[0], chunk_size): end = min(start + chunk_size, data.shape[0]) chunk = data[start:end] chunk_weights = weights if isinstance(weights, float) else weights[start:end] + # Pre-allocate reusable buffers to avoid new allocations + tmp = torch.empty_like(chunk) + quant_data = torch.empty_like(chunk) + diff = torch.empty_like(chunk) + rmin = torch.min(chunk, dim=1, keepdim=True)[0] rmax = torch.max(chunk, dim=1, keepdim=True)[0] sum_w = torch.sum(chunk_weights, dim=1, keepdim=True) sum_x = torch.sum(chunk_weights * chunk, dim=1, keepdim=True) + scale = (rmax - rmin) / (maxq - minq) iscale = get_reciprocal(scale) - quant_data = torch.clamp(torch.round(iscale * (chunk - rmin)), minq, maxq) - diff = scale * quant_data + rmin - chunk - best_mad = torch.sum( - (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True - ) + + # tmp = (chunk - rmin) * iscale + 
tmp.copy_(chunk).sub_(rmin).mul_(iscale) + + # quant_data = round(tmp).clamp_() + torch.round(tmp, out=quant_data) + quant_data.clamp_(minq, maxq) + + # diff = scale * quant_data + rmin - chunk + diff.copy_(quant_data).mul_(scale).add_(rmin).sub_(chunk) + + if use_mad: + best_mad = (chunk_weights * diff.abs_()).sum(dim=1, keepdim=True) + else: + diff.pow_(2) + best_mad = (chunk_weights * diff).sum(dim=1, keepdim=True) for is_ in range(nstep): factor = rrmin + rdelta * is_ + maxq - minq + scale_new = (rmax - rmin) / factor iscale_new = get_reciprocal(scale_new) - quant_data_new = torch.clamp(torch.round(iscale_new * (chunk - rmin)), minq, maxq) - mul_weights_quant_data = chunk_weights * quant_data_new - sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) - sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) - sum_xl = torch.sum(mul_weights_quant_data * chunk, dim=-1, keepdim=True) - D = sum_w * sum_l2 - torch.pow(sum_l, 2) + + # tmp = (chunk - rmin) * iscale_new + tmp.copy_(chunk).sub_(rmin).mul_(iscale_new) + + torch.round(tmp, out=quant_data) + quant_data.clamp_(minq, maxq) + + # tmp = chunk_weights * quant_data + tmp.copy_(quant_data).mul_(chunk_weights) + + sum_l = tmp.sum(dim=-1, keepdim=True) + sum_l2 = (tmp * quant_data).sum(dim=-1, keepdim=True) + sum_xl = (tmp * chunk).sum(dim=-1, keepdim=True) + + D = sum_w * sum_l2 - sum_l * sum_l + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D - this_min[this_min > 0] = 0 - this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] + + mask = this_min > 0 + if mask.any(): + this_min[mask] = 0 + this_scale[mask] = (sum_xl / sum_l2)[mask] + reverse_this_scale = get_reciprocal(this_scale) - quant_data = torch.clamp(torch.round(reverse_this_scale * (chunk - this_min)), minq, maxq) - diff = this_scale * quant_data + this_min - chunk - mad = torch.sum( - (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), - dim=-1, - keepdim=True, - ) + + # tmp = (chunk - this_min) * reverse_this_scale + tmp.copy_(chunk).sub_(this_min).mul_(reverse_this_scale) + + torch.round(tmp, out=quant_data) + quant_data.clamp_(minq, maxq) + + # diff = this_scale * quant_data + this_min - chunk + diff.copy_(quant_data).mul_(this_scale).add_(this_min).sub_(chunk) + + if use_mad: + mad = (chunk_weights * diff.abs_()).sum(dim=-1, keepdim=True) + else: + diff.pow_(2) + mad = (chunk_weights * diff).sum(dim=-1, keepdim=True) + idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] + best_mad[idx_to_replace] = mad[idx_to_replace] scale[idx_to_replace] = this_scale[idx_to_replace] rmin[idx_to_replace] = this_min[idx_to_replace] + results_scale.append(scale.to(torch.float32)) results_rmin.append(-rmin.to(torch.float32)) + + # YOUR ORIGINAL LOGIC — kept unchanged if split_num > 1: clear_memory(device_list=[data.device]) @@ -648,27 +628,18 @@ def iterative_wls_quant_search( """ # TODO this one should change to try catch later - if split_num > 1: - return iterative_wls_quant_search_chunk( - data=data, - bits=bits, - rrmin=rrmin, - rdelta=rdelta, - nstep=nstep, - use_mad=use_mad, - weights=weights, - split_num=split_num, - ) - else: - return iterative_wls_quant_search_non_chunk( - data=data, - bits=bits, - rrmin=rrmin, - rdelta=rdelta, - nstep=nstep, - use_mad=use_mad, - weights=weights, - ) + + return iterative_wls_quant_search_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + 
split_num=split_num, + ) + @torch.no_grad() diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index d8c2578b9..912d543cf 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -640,15 +640,15 @@ def q3_k_quant_block( if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = (torch.round(blocks * get_reciprocal(qdq_scale.unsqueeze(-1))).clip(-4, 3) + 4).to(torch.uint8) - q_scales_offset = torch.round(qdq_scale * get_reciprocal(dq_scale)).clip(-32, 31) + 32 + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) elif original: ## this is correct scales, _ = make_q3_quants(blocks, bits=3, do_rmse=True) scales_abs_max = abs(scales).argmax(dim=-1, keepdim=True) max_scales_mag = torch.take_along_dim(scales, scales_abs_max, dim=-1) inverse_dq_scale = -32 * get_reciprocal(max_scales_mag) dq_scale = get_reciprocal(inverse_dq_scale) - qscale = torch.round(inverse_dq_scale * scales).clip(-32, 31) + qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) all_L = (torch.round(blocks * reverse_qdq_scale.unsqueeze(-1)).clip(-4, 3) + 4).to(torch.uint8) @@ -687,7 +687,7 @@ def q3_k_quant_block( @register_qtype("q4_k") def q4_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -698,7 +698,7 @@ def q4_k_quant_block( output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) - q_mins = (mins * get_reciprocal(output_dmin)).round_().clam_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) @@ -738,8 +738,7 @@ def q4_k_quant_block( ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num > 1: - orig_device = blocks.device + if split_num is not None and split_num > 1: blocks = blocks.to("cpu") scales = scales.to("cpu") d_scale = d_scale.to("cpu") @@ -791,26 +790,22 @@ def q5_k_quant_block( d_wmin=None, imatrix=None, original=False, - split_num=None, + split_num=1, **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) - output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) - if scale is not None: scales = scale.reshape(-1, QK_K // 32) mins = wmin.reshape(-1, QK_K // 32) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) - all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 31) - .to(torch.uint8) - ) + q_scales = (scales * 
get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) + all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). + round_().clamp_(0, 31).to(torch.uint8)) + elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=5, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=False) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -819,17 +814,14 @@ def q5_k_quant_block( id_mins = (63 * get_reciprocal(max_mins)).clamp(min=0) output_d = max_scales / 63 output_dmin = max_mins / 63 - q_scales = torch.round(id_scales * scales).clip(0, 63).to(torch.uint8) - q_mins = torch.round(id_mins * mins).clip(0, 63).to(torch.uint8) + q_scales = (id_scales * scales).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (id_mins * mins).round_().clamp_(0, 63).to(torch.uint8) d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids] = ( - torch.round((blocks[replace_ids] + dm_tmp[replace_ids].unsqueeze(-1)) / d_tmp[replace_ids].unsqueeze(-1)) - .clip(0, 31) - .to(torch.uint8) - ) + all_L[replace_ids]=(blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). + div_(d_tmp[replace_ids].unsqueeze(-1)).round_().clamp_(0,31).to(torch.int8)) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -839,19 +831,25 @@ def q5_k_quant_block( ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] + if split_num is not None and split_num > 1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + mins = mins.to("cpu") + d_wmin = d_wmin.to("cpu") blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) mins = mins.reshape((-1, QK_K // 32)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) - q_scales = torch.round(scales * get_reciprocal(output_d)).clip(0, 63).to(torch.uint8) - q_mins = torch.round(mins * get_reciprocal(output_dmin)).clip(0, 63).to(torch.uint8) - all_L = ( - torch.round((blocks + mins.unsqueeze(-1)) * get_reciprocal(scales.unsqueeze(-1))) - .clip(0, 31) + q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) + all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + .clamp_(0, 31) .to(torch.uint8) ) + output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) output_scale[:, :4] = q_scales[:, :4] output_scale[:, 4:8] = q_mins[:, :4] From 5ffa12b61bf4e9d76c3f8d8f3edc6ce3e7ed686c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 04:16:24 +0000 Subject: [PATCH 18/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 2 -- auto_round/export/export_to_gguf/packing.py | 24 ++++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 4912e6e77..84ef77f29 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -497,7 +497,6 @@ def quant_tensor_gguf_asym_dq( return tensor, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} - # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def 
iterative_wls_quant_search_chunk( data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 @@ -641,7 +640,6 @@ def iterative_wls_quant_search( ) - @torch.no_grad() def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 912d543cf..6779a81ef 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -803,8 +803,13 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). - round_().clamp_(0, 31).to(torch.uint8)) + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() + .clamp_(0, 31) + .to(torch.uint8) + ) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=5, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=False) @@ -820,8 +825,14 @@ def q5_k_quant_block( d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids]=(blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). - div_(d_tmp[replace_ids].unsqueeze(-1)).round_().clamp_(0,31).to(torch.int8)) + all_L[replace_ids] = ( + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() + .clamp_(0, 31) + .to(torch.int8) + ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -845,7 +856,10 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() .clamp_(0, 31) .to(torch.uint8) ) From db5c64237aebf106eda9fd797d3cc646bf7010eb Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:19:30 +0800 Subject: [PATCH 19/57] all ggufs use inplace ops --- auto_round/export/export_to_gguf/packing.py | 32 +++++++++++++-------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 912d543cf..a5fc92d38 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -433,10 +433,10 @@ def q4_0_quant_block(blocks, scale=None, zp=None, **kwargs): max = torch.take_along_dim(blocks, imax, dim=-1) d = max / -8 id = get_reciprocal(d) + n_blocks = blocks.shape[0] + qs = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(8.5)).clamp_(0, 15).to(torch.uint8) - qs = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 8.5).clip(0, 15).to(torch.uint8) - n_blocks = blocks.shape[0] block_size = GGML_QUANT_SIZES["q4_0"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << 4) @@ -456,10 +456,11 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): min = blocks.min(axis=-1, keepdims=True)[0] d = (max - min) / 
15 id = get_reciprocal(d) + n_blocks = blocks.shape[0] + + qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) - qs = torch.trunc((blocks - min) * id + 0.5).clip(0, 15).to(torch.uint8) - n_blocks = blocks.shape[0] block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) @@ -483,7 +484,7 @@ def q5_0_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_0"][0] # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = torch.trunc(blocks.to(torch.float64) * id.to(torch.float64) + 16.5).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(16.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -635,14 +636,14 @@ def q3_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) - output_scale = np.empty((nb, K_SCALE_SIZE), dtype=np.uint8) + if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) - elif original: ## this is correct + elif original: scales, _ = make_q3_quants(blocks, bits=3, do_rmse=True) scales_abs_max = abs(scales).argmax(dim=-1, keepdim=True) max_scales_mag = torch.take_along_dim(scales, scales_abs_max, dim=-1) @@ -651,8 +652,8 @@ def q3_k_quant_block( qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) - all_L = (torch.round(blocks * reverse_qdq_scale.unsqueeze(-1)).clip(-4, 3) + 4).to(torch.uint8) - q_scales_offset = torch.round(qdq_scale * inverse_dq_scale).clip(-32, 31) + 32 + all_L = blocks.mul_(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + q_scales_offset = (qdq_scale * inverse_dq_scale).round_().clamp_(-32, 31).add_(32) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -661,12 +662,19 @@ def q3_k_quant_block( blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num ) scales, d_scale = scales["scale"], scales["d_scale"] + if split_num is not None and split_num > 1: + blocks = blocks.to("cpu") + scales = scales.to("cpu") + d_scale = d_scale.to("cpu") + blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = (torch.round(blocks * get_reciprocal(qdq_scale.unsqueeze(-1))).clip(-4, 3) + 4).to(torch.uint8) - q_scales_offset = torch.round(qdq_scale * get_reciprocal(dq_scale)).clip(-32, 31) + 32 + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4,3).add_(4).to(torch.uint8) + + q_scales_offset =(qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) + output_scale = np.empty((nb, K_SCALE_SIZE), dtype=np.uint8) q_scales_offset = q_scales_offset.cpu().numpy().astype(np.uint8) output_scale[:, :8] = (q_scales_offset[:, :8] & 0xF) | ((q_scales_offset[:, 8:] & 0xF) << 4) hmask = q_scales_offset >> 4 @@ -756,7 +764,7 @@ def q4_k_quant_block( blocks.add_(mins.unsqueeze(-1)) 
.mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() - .clamp(0, 15) + .clamp_(0, 15) .to(torch.uint8) ) output_scale = torch.empty((nb, K_SCALE_SIZE), dtype=torch.uint8, device=blocks.device) From ec6cb4629ebae417e3a7ab420b1955b508cf1d01 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:47:32 +0800 Subject: [PATCH 20/57] update --- auto_round/data_type/gguf.py | 11 ++-- auto_round/data_type/int.py | 60 +++++++++++++++----- auto_round/export/export_to_gguf/packing.py | 62 ++++++++++++++++++--- auto_round/utils/device.py | 1 - 4 files changed, 105 insertions(+), 29 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 4912e6e77..e6de5ed6f 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -351,13 +351,12 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens return imatrix.reshape(weight.shape) -@torch.no_grad() +@torch.inference_mode() def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None, split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 - if bits not in [2, 4, 5]: - raise ValueError(f"bits={bits} not supported by rtn_int_asym_dq") + quant_weights = None if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): search_kwargs = { @@ -470,6 +469,8 @@ def quant_tensor_gguf_asym_dq( Returns: Tuple: (Quantized-dequantized tensor, scale dictionary, zero-point dictionary) """ + if bits not in [2, 4, 5]: + raise ValueError(f"bits={bits} not supported by rtn_int_asym_dq") orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 @@ -602,7 +603,6 @@ def iterative_wls_quant_search_chunk( results_scale.append(scale.to(torch.float32)) results_rmin.append(-rmin.to(torch.float32)) - # YOUR ORIGINAL LOGIC — kept unchanged if split_num > 1: clear_memory(device_list=[data.device]) @@ -641,8 +641,7 @@ def iterative_wls_quant_search( ) - -@torch.no_grad() +@torch.inference_mode() def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index c32646da7..325c9d40a 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -22,25 +22,60 @@ def search_scales(data: torch.Tensor, bits: int, qw: Union[None, torch.Tensor, float] = None) -> torch.Tensor: - nmax = pow(2, bits - 1) + # Maximum absolute value for symmetric quantization + nmax = 1 << (bits - 1) # equivalent to pow(2, bits-1) + + # Find per-group max along the last dimension imax = torch.abs(data).argmax(dim=-1, keepdim=True) group_max = torch.take_along_dim(data, imax, dim=-1) + + # Compute initial inverse scales iscales = -nmax * get_reciprocal(group_max) - scales = get_reciprocal(iscales) - L = torch.round(1.0 * iscales * data).clip(-nmax, nmax - 1) + scales = get_reciprocal(iscales) # scale = 1 / iscales + + # Initial quantized values (in-place round and clamp) + L = torch.empty_like(data) + torch.round(iscales * data, out=L) + L.clamp_(-nmax, nmax - 1) + + # Set default weight if None if qw is None: qw = 1.0 - best_loss = torch.sum(((scales * L - data).to(torch.float32)) ** 2 * qw, dim=-1) + + # Compute initial best loss + best_loss = ((scales * L - data).to(torch.float32)) ** 2 + if isinstance(qw, torch.Tensor): + best_loss.mul_(qw) # inplace multiply by weight + best_loss = torch.sum(best_loss, dim=-1) + + # Iterative search 
over small adjustments for _is in range(-18 * 5, 18 * 5 + 1): if _is == 0: continue - iscales = -(nmax - 0.01 * _is) * get_reciprocal(group_max) - tmp_L = torch.round(iscales * data).clip(-nmax, nmax - 1) - tmp_scales = get_reciprocal(iscales) - loss = torch.sum(((tmp_scales * tmp_L - data).to(torch.float32)) ** 2 * qw, dim=-1) + + # Update iscales in-place + iscales_tmp = -(nmax - 0.01 * _is) * get_reciprocal(group_max) + + # Compute temporary quantized values (in-place round + clamp) + tmp_L = torch.empty_like(data) + torch.round(iscales_tmp * data, out=tmp_L) + tmp_L.clamp_(-nmax, nmax - 1) + + # Compute temporary scales + tmp_scales = get_reciprocal(iscales_tmp) + + # Compute temporary loss + loss = ((tmp_scales * tmp_L - data).to(torch.float32)) ** 2 + if isinstance(qw, torch.Tensor): + loss.mul_(qw) + loss = torch.sum(loss, dim=-1) + + # Replace scales where loss improves (in-place) replace_id = loss < best_loss - scales[replace_id] = tmp_scales[replace_id] - best_loss[replace_id] = loss[replace_id] + if replace_id.any(): + scales[replace_id] = tmp_scales[replace_id] + best_loss[replace_id] = loss[replace_id] + return scales @@ -74,9 +109,8 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w = torch.round(tensor / scale) - q = torch.clamp(int_w, -maxq, maxq - 1) - qdq_result = (scale * q).to(tensor.dtype) + int_w =tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) + qdq_result = (int_w.mul_(scale)).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, scale, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index a5fc92d38..0952578e1 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -259,34 +259,78 @@ def make_qx_quants(data, bits, rmse_type=0, qw=None): def make_q3_quants(data, bits, do_rmse=False): - nmax = pow(2, bits - 1) + # Maximum absolute integer value for symmetric quantization + nmax = 1 << (bits - 1) # equivalent to pow(2, bits-1) + + # Find per-group max indices along last dim imax = abs(data).argmax(axis=-1, keepdims=True) + + # Gather group-wise maximum values group_max = torch.take_along_dim(data, imax, dim=-1) + + # Compute inverse scale in-place (multiplying by -nmax) iscale = -nmax * get_reciprocal(group_max) + if do_rmse: - L = torch.round(iscale * data).clip(-nmax, nmax - 1) - w = torch.pow(data, 2) + # Initial quantization L (in-place round and clamp) + L = torch.empty_like(data) + torch.round(iscale * data, out=L) + L.clamp_(-nmax, nmax - 1) + + # Weight for RMSE = x^2 (in-place) + w = data.clone().pow_(2) + + # Precompute sums sumlx = torch.sum(w * data * L, dim=-1) suml2 = torch.sum(w * L * L, dim=-1) - for itry in range(5): + # Iterative RMSE refinement + for _ in range(5): for i in range(sumlx.shape[-1]): - w_tmp, data_tmp, L_tmp = w[:, :, i], data[:, :, i], L[:, :, i] + # Extract current slice + w_tmp = w[:, :, i] + data_tmp = data[:, :, i] + L_tmp = L[:, :, i] + + # Exclude current slice from sums slx = sumlx - w_tmp * data_tmp * L_tmp replace_idx = slx > 0 - sl2 = suml2 - w_tmp * torch.pow(L_tmp, 2) - new_L = torch.round(data_tmp * sl2 / slx).clip(-nmax, nmax - 1) + sl2 = suml2 - w_tmp * L_tmp * L_tmp + + # Compute new L candidate (in-place round and clamp) + new_L = 
torch.empty_like(L_tmp) + torch.round(data_tmp * sl2 / slx, out=new_L) + new_L.clamp_(-nmax, nmax - 1) + + # Identify positions to update tmp_replace_idx = replace_idx & (new_L != L_tmp) + + # Update sums where L changes slx[tmp_replace_idx] += w_tmp[tmp_replace_idx] * data_tmp[tmp_replace_idx] * new_L[tmp_replace_idx] sl2[tmp_replace_idx] += w_tmp[tmp_replace_idx] * new_L[tmp_replace_idx] * new_L[tmp_replace_idx] + + # Further check condition for improvement replace_idx &= (sl2 > 0) & (slx * slx * suml2 > sumlx * sumlx * sl2) - L[:, :, i][replace_idx] = new_L[replace_idx] + + # Update L in-place + L_tmp[replace_idx] = new_L[replace_idx] + + # Update global sums sumlx = slx suml2 = sl2 + + # Compute final scale and return quantized L return sumlx * get_reciprocal(suml2), L.to(torch.uint8) - L = torch.round(iscale * data).clip(-nmax, nmax - 1) + nmax + # Fast path: quantize without RMSE (in-place round, clamp, shift) + L = torch.empty_like(data) + torch.round(iscale * data, out=L) + L.clamp_(-nmax, nmax - 1) + L.add_(nmax) + + # Compute scales (reciprocal of iscale) scales = get_reciprocal(iscale).reshape(iscale.shape[:2]) + return scales, L.to(torch.uint8) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 3ab9ae72a..0d8413561 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -463,7 +463,6 @@ def _clear_memory_for_cpu_and_cuda( @torch._dynamo.disable() def clear_memory(tensor: torch.Tensor | None | list[torch.Tensor] = None, device_list: list | tuple | None = None): - # logger.info("call") from auto_round.utils.device import is_hpex_available if is_hpex_available(): From 5a503b4d9c2ad72d010ad52c0060fcb4492a4d7e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:48:54 +0800 Subject: [PATCH 21/57] update --- auto_round/export/export_to_gguf/packing.py | 24 ++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 0952578e1..2714806ed 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -855,8 +855,13 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))). - round_().clamp_(0, 31).to(torch.uint8)) + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() + .clamp_(0, 31) + .to(torch.uint8) + ) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=5, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=False) @@ -872,8 +877,14 @@ def q5_k_quant_block( d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 - all_L[replace_ids]=(blocks[replace_ids].add_(dm_tmp[replace_ids].unsqueeze(-1)). 
- div_(d_tmp[replace_ids].unsqueeze(-1)).round_().clamp_(0,31).to(torch.int8)) + all_L[replace_ids] = ( + blocks[replace_ids] + .add_(dm_tmp[replace_ids].unsqueeze(-1)) + .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() + .clamp_(0, 31) + .to(torch.int8) + ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq @@ -897,7 +908,10 @@ def q5_k_quant_block( output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) - all_L = (blocks.add_(mins.unsqueeze(-1)).mul_(get_reciprocal(scales.unsqueeze(-1))).round_() + all_L = ( + blocks.add_(mins.unsqueeze(-1)) + .mul_(get_reciprocal(scales.unsqueeze(-1))) + .round_() .clamp_(0, 31) .to(torch.uint8) ) From 737977a992fac1b42440fcdf265121f374cfd019 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 05:50:03 +0000 Subject: [PATCH 22/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 1 - auto_round/data_type/int.py | 2 +- auto_round/export/export_to_gguf/packing.py | 16 +++++++++------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index e580ce0a6..a0eebfb1c 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -356,7 +356,6 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 - quant_weights = None if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): search_kwargs = { diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 325c9d40a..8c7b1f261 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -109,7 +109,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w =tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) + int_w = tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) qdq_result = (int_w.mul_(scale)).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, scale, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 2714806ed..9a885212a 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -480,7 +480,6 @@ def q4_0_quant_block(blocks, scale=None, zp=None, **kwargs): n_blocks = blocks.shape[0] qs = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(8.5)).clamp_(0, 15).to(torch.uint8) - block_size = GGML_QUANT_SIZES["q4_0"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << 4) @@ -504,7 +503,6 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) - block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) @@ -528,7 +526,13 @@ def q5_0_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): 
block_size = GGML_QUANT_SIZES["q5_0"][0] # FIXME: Q5_0's reference rounding is cursed and depends on FMA - q = torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(16.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = ( + torch.trunc(blocks.to(torch.float64).mul_(id.to(torch.float64)).add_(16.5)) + .clamp_(0, 31) + .to(torch.uint8) + .cpu() + .numpy() + ) qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -680,8 +684,6 @@ def q3_k_quant_block( nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) - - if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) @@ -714,9 +716,9 @@ def q3_k_quant_block( blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4,3).add_(4).to(torch.uint8) + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) - q_scales_offset =(qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) + q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) output_scale = np.empty((nb, K_SCALE_SIZE), dtype=np.uint8) q_scales_offset = q_scales_offset.cpu().numpy().astype(np.uint8) From c3b9213b5c34af034ef291ddf47acb9c0f32d483 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:54:59 +0800 Subject: [PATCH 23/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9a885212a..3995cdf16 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -770,7 +770,7 @@ def q4_k_quant_block( output_d = max_scales / 63 output_dmin = max_mins / 63 q_scales = (id_scales * scales).round_().clamp_(0, 63).to(torch.uint8) - q_mins = (id_mins * mins).round_().clip(0, 63).to(torch.uint8) + q_mins = (id_mins * mins).round_().clamp_(0, 63).to(torch.uint8) d_tmp = output_d * q_scales dm_tmp = output_dmin * q_mins From c2fe2672a0dff6d132146867672f0b5bc4c8cb1c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:55:45 +0800 Subject: [PATCH 24/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 3995cdf16..8da13260f 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -779,6 +779,7 @@ def q4_k_quant_block( blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) .div_(d_tmp[replace_ids].unsqueeze(-1)) + .round_() .clamp_(0, 15) .to(torch.uint8) ) From 963e6f9007f40dfc607ca60e8b8140ccdc993531 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:56:32 +0800 Subject: [PATCH 25/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 8da13260f..47f7d24c4 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -886,7 +886,7 @@ def q5_k_quant_block( .div_(d_tmp[replace_ids].unsqueeze(-1)) .round_() .clamp_(0, 31) - .to(torch.int8) + .to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq From 4c6366ab28f3da04f17c660040b7d9f86605a329 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:57:20 +0800 Subject: [PATCH 26/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 47f7d24c4..6b8a40eb1 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -910,7 +910,7 @@ def q5_k_quant_block( output_d = d_scale.reshape(-1, 1).to(torch.float32) output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) - q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp(0, 63).to(torch.uint8) + q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) From 343dbb6b963f085bc2572546f7f40b148240df96 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 13:57:45 +0800 Subject: [PATCH 27/57] Update auto_round/data_type/gguf.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/data_type/gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index a0eebfb1c..1e9fe3256 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -643,7 +643,8 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) # TODO split num + # Note: make_q3_quants does not support split_num/chunking; 3-bit quantization is performed in a single chunk. 
+ scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) From 932f407d5df926c703e78f43e4619e3377f13117 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:00:43 +0800 Subject: [PATCH 28/57] Update auto_round/compressors/base.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 48f082f3a..5cb487e2d 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1094,7 +1094,7 @@ def _quantize_embedding_layer(self): # Attempt quantization on GPU, fall back to CPU if OOM try: weight, scale, zp = quant_func( - module.weight.to(dtype).to(self.device), + module.weight.to(dtype=dtype, device=self.device), **{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]}, ) except torch.OutOfMemoryError: From 8304a0255d06f3823339af899a7471eb8e31e2cc Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:01:36 +0800 Subject: [PATCH 29/57] Update auto_round/export/export_to_gguf/packing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 6b8a40eb1..c477f3b90 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -956,7 +956,7 @@ def q6_k_quant_block( output_d = d_scale.reshape(-1, 1).to(torch.float32) rd = get_reciprocal(output_d) output_scale = scales.mul(rd).round_().clamp_(max=127).to(torch.int8) - rs = get_reciprocal(scales).unsqueeze_(-1) # inplace unsqueeze + rs = get_reciprocal(scales).unsqueeze_(-1) # unsqueeze for broadcasting all_L = blocks.mul(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) elif original: scales, all_L = make_qx_quants(blocks, bits=6, rmse_type=1, qw=None) From bc86fdcf4235b4bc1b023f3127c2e8e7c300ec57 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:03:40 +0800 Subject: [PATCH 30/57] fix by comments --- auto_round/data_type/gguf.py | 2 +- auto_round/data_type/int.py | 2 +- auto_round/export/export_to_gguf/packing.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index a0eebfb1c..515f5f1af 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -489,7 +489,7 @@ def quant_tensor_gguf_asym_dq( ) inverse_scale = get_reciprocal(scale) - tensor = tensor.add_(wmin) + tensor = tensor+wmin tensor = (tensor.mul_(inverse_scale)).round_().clamp_(0, maxq) tensor = tensor.mul_(scale) tensor = tensor.sub_(wmin).to(orig_dtype) diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 8c7b1f261..960a7fc08 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -109,7 +109,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 scale = search_scales(tensor, bits, qw=imatrix) scale = torch.where(scale < 0, torch.clamp(scale, max=-q_scale_thresh), torch.clamp(scale, min=q_scale_thresh)) - int_w = tensor.div_(scale).round_().clamp_(-maxq, maxq - 1) + 
int_w = tensor.div(scale).round_().clamp_(-maxq, maxq - 1) qdq_result = (int_w.mul_(scale)).to(tensor.dtype) qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, scale, maxq diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9a885212a..198e45476 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -967,8 +967,8 @@ def q6_k_quant_block( output_scale = (iscales * scales).round_().clamp_(max=127).to(torch.int8) d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 - all_L[replace_ids] = ( - (blocks[replace_ids] / d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L[replace_ids] = (blocks[replace_ids].div_(d_tmp[replace_ids]). + reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq From 033330e7f335b384257b06a89f3acd7af945d84f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 06:04:30 +0000 Subject: [PATCH 31/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 2 +- auto_round/export/export_to_gguf/packing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 317721189..aa7eb6447 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -489,7 +489,7 @@ def quant_tensor_gguf_asym_dq( ) inverse_scale = get_reciprocal(scale) - tensor = tensor+wmin + tensor = tensor + wmin tensor = (tensor.mul_(inverse_scale)).round_().clamp_(0, maxq) tensor = tensor.mul_(scale) tensor = tensor.sub_(wmin).to(orig_dtype) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index b62227639..1f3b3a1f3 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -968,8 +968,8 @@ def q6_k_quant_block( output_scale = (iscales * scales).round_().clamp_(max=127).to(torch.int8) d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 - all_L[replace_ids] = (blocks[replace_ids].div_(d_tmp[replace_ids]). - reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L[replace_ids] = ( + blocks[replace_ids].div_(d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq From c905bd27cc22786d6dde260bea6cee8ead632e2f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:13:44 +0800 Subject: [PATCH 32/57] fix line too long --- auto_round/data_type/gguf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index aa7eb6447..4007b1d72 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -643,9 +643,10 @@ def iterative_wls_quant_search( def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num): if imatrix is None or (imatrix is not None and torch.sum(imatrix) == 0): if bits == 3: - # Note: make_q3_quants does not support split_num/chunking; 3-bit quantization is performed in a single chunk. + # Note: make_q3_quants does not support split_num/chunking; + # 3-bit quantization is performed in a single chunk. 
scale, int_w = make_q3_quants(tensor, bits=bits, do_rmse=True) - ##scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) + # scale, int_w = make_qx_quants(tensor, bits=bits, rmse_type=1, qw=None) elif bits == 6: scale, int_w = make_qx_quants_chunk(tensor, bits=bits, rmse_type=1, qw=None, split_num=split_num) else: From a61657937c75013f5810e38cc0e4ee47758cd6d8 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 14:20:09 +0800 Subject: [PATCH 33/57] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 367a08c7c..f0b0ebb25 100644 --- a/README.md +++ b/README.md @@ -184,14 +184,14 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") Important Hyperparameters ##### Quantization Scheme & Configuration -- **`scheme` (str|dict|AutoScheme)**: The predefined quantization keys, e.g. `W4A16`, `MXFP4`, `NVFP4`, `GGUF:Q4_K_M`. +- **`scheme` (str|dict|AutoScheme)**: The predefined quantization keys, e.g. `W4A16`, `MXFP4`, `NVFP4`, `GGUF:Q4_K_M`. For MXFP4/NVFP4, we recommend exporting to LLM-Compressor format. - **`bits` (int)**: Number of bits for quantization (default is `None`). If not None, it will override the scheme setting. - **`group_size` (int)**: Size of the quantization group (default is `None`). If not None, it will override the scheme setting. - **`sym` (bool)**: Whether to use symmetric quantization (default is `None`). If not None, it will override the scheme setting. -- **`layer_config` (dict)**: Configuration for weight quantization (default is `None`), mainly for mixed schemes. +- **`layer_config` (dict)**: Configuration for layer_wise scheme (default is `None`), mainly for customized mixed schemes. ##### Algorithm Settings -- **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0` Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. +- **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0`. Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. - **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled). 
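A minimal sketch of how these options fit together, assuming the keyword arguments described above are accepted by the `AutoRound` constructor (the checkpoint name and the 8-bit `lm_head` override are illustrative placeholders, not recommendations):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Customized mixed scheme: W4A16 everywhere, but keep lm_head at 8 bits.
layer_config = {"lm_head": {"bits": 8}}

ar = AutoRound(
    model,
    tokenizer,
    scheme="W4A16",          # predefined scheme key
    layer_config=layer_config,
    disable_opt_rtn=False,   # keep the improved RTN path
)
ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
```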
##### Tuning Process Parameters From cd01f1320a2a5a2fb34203c8ac40ee215b9a534d Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 19 Nov 2025 15:43:15 +0800 Subject: [PATCH 34/57] update --- auto_round/compressors/base.py | 2 +- auto_round/export/export_to_gguf/convert.py | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 5cb487e2d..ada41fa65 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1362,7 +1362,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - self._quantize_embedding_layer() + # self._quantize_embedding_layer() self.model.to("cpu") # Release memory diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index c075bbe7c..04911292a 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -190,7 +190,7 @@ def is_extra_tensor(tensor_name): def _quant_data_with_args(data_torch, data_qtype, scale, zp, d_scale=None, wmin=None, d_wmin=None, imatrix=None): - device = get_packing_device() + device = data_torch.device data_torch = data_torch.to(torch.float32) scale = scale.to(torch.float32) if isinstance(scale, torch.Tensor) else scale zp = zp.to(torch.float32) if isinstance(zp, torch.Tensor) else zp @@ -215,7 +215,7 @@ def _quant_data_with_args(data_torch, data_qtype, scale, zp, d_scale=None, wmin= def _quant_data(cls, data_torch, data_qtype, name, modify_name, bid): suffix = ".weight" - device = get_packing_device() + device = data_torch.device if suffix in name: layer_name = name[: -len(suffix)] module = get_module(cls.model, layer_name) @@ -406,9 +406,10 @@ def prepare_tensors(cls): modify_name = _special_name_handle(cls, name) orig_device = data_torch.device - data_torch = data_torch.to("cpu") + import psutil, os + process = psutil.Process(os.getpid()) + print(f"CPU RAM: {process.memory_info().rss / 1024 ** 2:.2f} MB") for new_name, data_torch in cls.modify_tensors(data_torch, modify_name, bid): - data_torch.to(orig_device) skip = False for tensor_info in cls.gguf_writer.tensors: if new_name in tensor_info: @@ -417,12 +418,7 @@ def prepare_tensors(cls): break if skip: continue - data = data_torch.squeeze().cpu().numpy() - - # if data ends up empty, it means data_torch was a scalar tensor -> restore - if len(data.shape) == 0: - data = data_torch.numpy() - + data = data_torch.squeeze() n_dims = len(data.shape) data_qtype: gguf.GGMLQuantizationType | bool = cls.tensor_force_quant(name, new_name, bid, n_dims) @@ -537,6 +533,11 @@ def prepare_tensors(cls): gguf.GGMLQuantizationType.BF16, gguf.GGMLQuantizationType.F32, ]: + data = data_torch.squeeze().cpu().numpy() + + # if data ends up empty, it means data_torch was a scalar tensor -> restore + if len(data.shape) == 0: + data = data_torch.numpy() try: data = gguf.quants.quantize(data, data_qtype) except gguf.QuantError as e: From 876267e2fef4498a89546fa5e6b2176f8583cf4f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 07:44:03 +0000 Subject: [PATCH 35/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/convert.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 04911292a..c7bbe25fc 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -406,7 +406,10 @@ def prepare_tensors(cls): modify_name = _special_name_handle(cls, name) orig_device = data_torch.device - import psutil, os + import os + + import psutil + process = psutil.Process(os.getpid()) print(f"CPU RAM: {process.memory_info().rss / 1024 ** 2:.2f} MB") for new_name, data_torch in cls.modify_tensors(data_torch, modify_name, bid): From 1138c737993c985f9493622a86e74c1c98cd8ca9 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 10:31:02 +0800 Subject: [PATCH 36/57] clean code --- auto_round/export/export_to_gguf/convert.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 04911292a..572e4bb6f 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -406,9 +406,6 @@ def prepare_tensors(cls): modify_name = _special_name_handle(cls, name) orig_device = data_torch.device - import psutil, os - process = psutil.Process(os.getpid()) - print(f"CPU RAM: {process.memory_info().rss / 1024 ** 2:.2f} MB") for new_name, data_torch in cls.modify_tensors(data_torch, modify_name, bid): skip = False for tensor_info in cls.gguf_writer.tensors: From be5e13cf6a4deacc17ae7d7487d7d97e0c1e2908 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 16:53:59 +0800 Subject: [PATCH 37/57] update --- auto_round/export/export_to_gguf/packing.py | 99 +++++++++++---------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 1f3b3a1f3..cd7d9f32f 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -52,53 +52,62 @@ def ggml_quant( shape = data.shape n_blocks = data.nelement() // block_size - split_num = 1 - for dim in data.shape: - if dim > 100_000: - split_num = 16 - break - + split_num = 16 if max(data.shape) > 100_000 else 1 blocks = data.reshape((n_blocks, block_size)) quant_func = GGML_QUANT_TYPE[ggml_type] - try: - new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original, - split_num=split_num, - ) - except torch.OutOfMemoryError: - orig_device = blocks.device - device = "cpu" - blocks = blocks.to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory(device_list=orig_device) - new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original, - split_num=split_num, - ) - - assert new_data.shape[-1] == type_size - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) - new_data = new_data.reshape(*shape[:-1], -1) - return new_data + results = [] + for i in range(split_num): + if split_num > 1: + start = (n_blocks * i) // split_num + end = (n_blocks * (i + 1)) // split_num + blocks = data.reshape((n_blocks, 
block_size))[start:end] + scale = scale[start:end] if scale is not None else scale + zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin[start:end] if wmin is not None else wmin + d_scale = d_scale[start:end] if d_scale is not None else d_scale + d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + # imatrix = imatrix[start:end] if imatrix is not None else imatrix + try: + new_data = quant_func( + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + ) + except torch.OutOfMemoryError: + orig_device = blocks.device + device = "cpu" + blocks = blocks.to(device) + scale = scale.to(device) if scale is not None else scale + zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device) if wmin is not None else wmin + d_scale = d_scale.to(device) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(device) if imatrix is not None else imatrix + clear_memory(device_list=orig_device) + new_data = quant_func( + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original + ) + + assert new_data.shape[-1] == type_size + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) + new_data = new_data.reshape(*shape[:-1], -1) + results.append(new_data) + if len(results)==1: + return results[0] + else: + return torch.cat(results, dim=0) def torch_roundf(n): From 190ea033e354408de295cb2dfcc82b07fb735fbb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 08:56:43 +0000 Subject: [PATCH 38/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index cd7d9f32f..773b1fcf8 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -90,21 +90,14 @@ def ggml_quant( imatrix = imatrix.to(device) if imatrix is not None else imatrix clear_memory(device_list=orig_device) new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) assert new_data.shape[-1] == type_size new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) new_data = new_data.reshape(*shape[:-1], -1) results.append(new_data) - if len(results)==1: + if len(results) == 1: return results[0] else: return torch.cat(results, dim=0) From f16cde5c010b6e403e9ccea14b240d10713de720 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 17:40:05 +0800 Subject: [PATCH 39/57] fix typo --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 9dc931ecf..719b45d36 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2851,7 +2851,7 @@ def _quantize_block( if auto_offload: mv_module_from_gpu(block) - clear_memory(input_ids,device_list=self.device_list) + clear_memory(input_ids, device_list=self.device_list) memory_info_summary = 
memory_monitor.get_summary() logger.infoclean(dump_info + "," + memory_info_summary) From 9f408d15ac120aac766a91390100a48fd0075c46 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 17:57:36 +0800 Subject: [PATCH 40/57] update --- auto_round/data_type/gguf.py | 27 +++----- auto_round/export/export_to_gguf/packing.py | 72 +++++++-------------- auto_round/utils/device.py | 4 +- 3 files changed, 35 insertions(+), 68 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 4007b1d72..f96138892 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -452,7 +452,6 @@ def quant_tensor_gguf_asym_dq( wmin=None, d_scale=None, d_wmin=None, - split_num=None, **kwargs, ): """Quantizes and dequantizes a tensor using asymmetric integer quantization for formats like Q2_K, Q4_K, and Q5_K. @@ -473,12 +472,7 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - if split_num is None: - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break + split_num=1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) @@ -499,7 +493,7 @@ def quant_tensor_gguf_asym_dq( # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( - data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 + data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=1 ): dtype = torch.float32 data = data.to(dtype) @@ -602,9 +596,11 @@ def iterative_wls_quant_search_chunk( results_rmin.append(-rmin.to(torch.float32)) if split_num > 1: - clear_memory(device_list=[data.device]) - - return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) + clear_memory(device_list=data.device) + if len(results_scale)>1: + return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) + else: + return results_scale[0], results_rmin[0] def iterative_wls_quant_search( @@ -671,7 +667,7 @@ def quant_tensor_gguf_sym_dq( scale=None, d_scale=None, scale_dtype=torch.float16, - split_num=None, + split_num=1, **kwargs, ): """Quantize and de-quantize tensor asymmetrically. For Q3_K, Q6_K. 
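The split_num plumbing above follows one pattern throughout these patches: process the flattened weight groups in row chunks and release cached device memory between chunks so that peak VRAM stays bounded. A self-contained sketch of that pattern, assuming a generic per-chunk quantizer `fn` and using torch.cuda.empty_cache() as a rough stand-in for the project's clear_memory helper:

import torch

def apply_in_chunks(tensor: torch.Tensor, fn, split_num: int = 16):
    # Apply `fn` to row chunks of `tensor` and concatenate the results,
    # emptying the CUDA cache between chunks when chunking is active.
    n_rows = tensor.shape[0]
    chunk = (n_rows + split_num - 1) // split_num
    outs = []
    for i in range(split_num):
        part = tensor[i * chunk : (i + 1) * chunk]
        if part.numel() == 0:
            continue
        outs.append(fn(part))
        if split_num > 1 and part.is_cuda:
            torch.cuda.empty_cache()  # rough analogue of clear_memory(device_list=...)
    return outs[0] if len(outs) == 1 else torch.cat(outs, dim=0)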
@@ -698,13 +694,6 @@ def quant_tensor_gguf_sym_dq( maxq = 2 ** (bits - 1) group_size = 16 - if split_num is None: - split_num = 1 - for dim in tensor.shape: - if dim > 100_000: - split_num = 16 - break - tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) orig_dtype = tensor.dtype super_bits = 6 if bits == 3 else 8 diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 773b1fcf8..76713b538 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -56,6 +56,7 @@ def ggml_quant( blocks = data.reshape((n_blocks, block_size)) quant_func = GGML_QUANT_TYPE[ggml_type] results = [] + orig_device = blocks.device for i in range(split_num): if split_num > 1: start = (n_blocks * i) // split_num @@ -66,6 +67,7 @@ def ggml_quant( wmin = wmin[start:end] if wmin is not None else wmin d_scale = d_scale[start:end] if d_scale is not None else d_scale d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + shape = data.shape # imatrix = imatrix[start:end] if imatrix is not None else imatrix try: new_data = quant_func( @@ -79,7 +81,6 @@ def ggml_quant( original=original, ) except torch.OutOfMemoryError: - orig_device = blocks.device device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -92,15 +93,18 @@ def ggml_quant( new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) - - assert new_data.shape[-1] == type_size - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) - new_data = new_data.reshape(*shape[:-1], -1) results.append(new_data) + if split_num>1: + print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!",flush=True) + clear_memory(device_list=orig_device) + if len(results) == 1: - return results[0] + new_data= results[0] else: - return torch.cat(results, dim=0) + new_data = np.concatenate(results, axis=0) + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) + new_data = new_data.reshape(*shape[:-1], -1) + return new_data def torch_roundf(n): @@ -199,9 +203,12 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): L_list.append(L.to(torch.uint8)) # Concatenate all chunks along batch dimension - scales = torch.cat(scales_list, dim=0) - L = torch.cat(L_list, dim=0) - return scales, L + if len(scales_list)>1: + scales = torch.cat(scales_list, dim=0) + L = torch.cat(L_list, dim=0) + return scales, L + else: + return scales, L def make_qx_quants(data, bits, rmse_type=0, qw=None): @@ -593,7 +600,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") def q2_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=None, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs ): nb = blocks.shape[0] device = blocks.device @@ -645,18 +652,10 @@ def q2_k_quant_block( blocks.reshape(blocks.shape[0], -1) blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") - mins = 
mins.to("cpu") - d_wmin = d_wmin.to("cpu") - clear_memory(device_list=[device]) - blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) mins = mins.reshape((-1, QK_K // 16)) @@ -681,7 +680,7 @@ def q2_k_quant_block( @register_qtype("q3_k") def q3_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -707,13 +706,9 @@ def q3_k_quant_block( blocks = blocks.reshape(blocks.shape[0], -1) blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) @@ -791,16 +786,10 @@ def q4_k_quant_block( blocks.reshape(blocks.shape[0], -1) blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") - mins = mins.to("cpu") - d_wmin = d_wmin.to("cpu") blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) @@ -847,7 +836,6 @@ def q5_k_quant_block( d_wmin=None, imatrix=None, original=False, - split_num=1, **kwargs, ): nb = blocks.shape[0] @@ -895,16 +883,10 @@ def q5_k_quant_block( blocks.reshape(blocks.shape[0], -1) blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") - mins = mins.to("cpu") - d_wmin = d_wmin.to("cpu") blocks = blocks.reshape((nb, QK_K // 32, 32)) scales = scales.reshape((-1, QK_K // 32)) @@ -948,7 +930,7 @@ def q5_k_quant_block( @register_qtype("q6_k") def q6_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, split_num=None, **kwargs + blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) @@ -978,13 +960,9 @@ def q6_k_quant_block( blocks = blocks.reshape(blocks.shape[0], -1) blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix, split_num=split_num + blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix ) scales, d_scale = scales["scale"], scales["d_scale"] - if split_num is not None and split_num > 1: - blocks = blocks.to("cpu") - scales = scales.to("cpu") - d_scale = d_scale.to("cpu") blocks = blocks.reshape((nb, QK_K // 16, 16)) scales = scales.reshape((-1, QK_K // 16)) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 496f399bb..1db1fce69 100644 --- 
a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1367,7 +1367,7 @@ def update(self, device_list=None): process = psutil.Process() current_ram = process.memory_info().rss / 1024**3 # GB self.peak_ram = max(self.peak_ram, current_ram) - if device_list is None: # TODO this have issue, wait for clean memory all pass device_list + if device_list is None: # TODO this has issue, wait for clean_memory all pass device_list device_list = [0] if device_list is not None: if not isinstance(device_list, (list, tuple)): @@ -1379,7 +1379,7 @@ def update(self, device_list=None): device_list = list(range(torch.xpu.device_count())) for device in device_list: - if device == "cpu": + if str(device) == "cpu": continue if torch.cuda.is_available(): current_vram = torch.cuda.memory_reserved(device) / 1024**3 # GB From 2d059f3f8f27eb8cb238bfd42e77e76673da3f1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 09:58:53 +0000 Subject: [PATCH 41/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/gguf.py | 4 +- auto_round/export/export_to_gguf/packing.py | 42 +++++++-------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index f96138892..27a97f010 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -472,7 +472,7 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - split_num=1 + split_num = 1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) @@ -597,7 +597,7 @@ def iterative_wls_quant_search_chunk( if split_num > 1: clear_memory(device_list=data.device) - if len(results_scale)>1: + if len(results_scale) > 1: return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) else: return results_scale[0], results_rmin[0] diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 76713b538..cdfe74eed 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -94,14 +94,14 @@ def ggml_quant( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) results.append(new_data) - if split_num>1: - print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!",flush=True) + if split_num > 1: + print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!", flush=True) clear_memory(device_list=orig_device) if len(results) == 1: - new_data= results[0] + new_data = results[0] else: - new_data = np.concatenate(results, axis=0) + new_data = np.concatenate(results, axis=0) new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) new_data = new_data.reshape(*shape[:-1], -1) return new_data @@ -203,7 +203,7 @@ def make_qx_quants_chunk(data, bits, rmse_type=0, qw=None, split_num=1): L_list.append(L.to(torch.uint8)) # Concatenate all chunks along batch dimension - if len(scales_list)>1: + if len(scales_list) > 1: scales = torch.cat(scales_list, dim=0) L = torch.cat(L_list, dim=0) return scales, L @@ -599,9 +599,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: @register_qtype("q2_k") -def q2_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, **kwargs -): +def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, 
d_wmin=None, imatrix=None, original=False, **kwargs): nb = blocks.shape[0] device = blocks.device blocks = blocks.reshape((nb, QK_K // 16, 16)) # (nb, 16, 16) @@ -651,9 +649,7 @@ def q2_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] blocks = blocks.reshape((nb, QK_K // 16, 16)) @@ -679,9 +675,7 @@ def q2_k_quant_block( @register_qtype("q3_k") -def q3_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs -): +def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape(nb, QK_K // 16, 16) @@ -705,9 +699,7 @@ def q3_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=3, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) @@ -785,9 +777,7 @@ def q4_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -882,9 +872,7 @@ def q5_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks.reshape(blocks.shape[0], -1) - blocks, scales, mins = quant_tensor_gguf_asym_dq( - blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=4, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] @@ -929,9 +917,7 @@ def q5_k_quant_block( @register_qtype("q6_k") -def q6_k_quant_block( - blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs -): +def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, imatrix=None, **kwargs): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 16, 16)) device = blocks.device @@ -959,9 +945,7 @@ def q6_k_quant_block( from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq blocks = blocks.reshape(blocks.shape[0], -1) - blocks, scales, _ = quant_tensor_gguf_sym_dq( - blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix - ) + blocks, scales, _ = quant_tensor_gguf_sym_dq(blocks, bits=6, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] blocks = blocks.reshape((nb, QK_K // 16, 16)) From 78499a76523ae0a56f8f17a1bba25c23c94f9dd1 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 19:30:48 +0800 Subject: [PATCH 42/57] try to fix ut failure --- auto_round/export/export_to_gguf/packing.py | 29 ++++++++++----------- 1 file changed, 14 insertions(+), 15 
deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index cdfe74eed..869bc1eb8 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -95,7 +95,6 @@ def ggml_quant( ) results.append(new_data) if split_num > 1: - print(f"!!!!!!!!!!!!!{orig_device}!!!!!!!!!!!!!!!!!!!!", flush=True) clear_memory(device_list=orig_device) if len(results) == 1: @@ -510,7 +509,7 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): id = get_reciprocal(d) n_blocks = blocks.shape[0] - qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) + qs = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() @@ -567,7 +566,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -587,7 +586,7 @@ def q8_0_quant_block(blocks, scale=None, zp=None, **kwargs) -> np.ndarray: else: d = torch.abs(blocks).max(dim=1, keepdim=True)[0] / 127 id = get_reciprocal(d) - blocks = blocks.mul_(id) + blocks = blocks.mul(id) qs = torch_roundf(blocks).clamp_(-128, 127) # (n_blocks, 2) @@ -611,7 +610,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -659,7 +658,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) @@ -682,7 +681,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, if scale is not None: qdq_scale = scale.reshape(-1, QK_K // 16).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) elif original: scales, 
_ = make_q3_quants(blocks, bits=3, do_rmse=True) @@ -693,7 +692,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) - all_L = blocks.mul_(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * inverse_dq_scale).round_().clamp_(-32, 31).add_(32) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -705,7 +704,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) @@ -743,7 +742,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -789,7 +788,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -839,7 +838,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -884,7 +883,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add_(mins.unsqueeze(-1)) + blocks.add(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -939,7 +938,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, d_tmp = output_d * output_scale.to(torch.float32) replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks[replace_ids].div_(d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) + blocks[replace_ids].div(d_tmp[replace_ids]).reshape(-1, 1).add_(32).round_().clamp_(0, 63).to(torch.uint8) ) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -952,7 +951,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(max=127).to(torch.int8) - all_L = blocks.mul_(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) + 
all_L = blocks.mul(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) tmp_L = all_L.reshape(nb, 4, 64) & 0xF output_ql = (tmp_L[:, ::2] | (tmp_L[:, 1::2] << 4)).reshape(nb, QK_K // 2).cpu().numpy().astype(np.uint8) From 575103fc8c6fa615c3a0c0de7422a570374458cb Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 21:20:48 +0800 Subject: [PATCH 43/57] try to fix ut failure --- auto_round/compressors/base.py | 5 +- auto_round/export/export_to_gguf/packing.py | 157 ++++++++++---------- 2 files changed, 84 insertions(+), 78 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 719b45d36..2ccb22486 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -714,9 +714,8 @@ def _check_compatibility(self) -> None: if has_gguf and self.iters != 0 and self.bits != 3 and not self.enable_alg_ext: logger.warning( "`iters=0` is recommended when exporting to GGUF format except for bits 3," - " as we have optimized the RTN method for this case." - " Or add enable_alg_ext to use the new algorithm," - " refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" + " or add `enable_alg_ext` for better accuracy with much more tuning cost" + " Please refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" " to check the acc." ) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 869bc1eb8..9158f3742 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -29,73 +29,80 @@ def register(cls): return register +def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original): + try: + new_data = quant_func( + blocks, + scale, + zp=zp, + wmin=wmin, + d_scale=d_scale, + d_wmin=d_wmin, + imatrix=imatrix, + original=original, + ) + except torch.OutOfMemoryError: + device = "cpu" + blocks = blocks.to(device) + scale = scale.to(device) if scale is not None else scale + zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device) if wmin is not None else wmin + d_scale = d_scale.to(device) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(device) if imatrix is not None else imatrix + clear_memory(device_list=orig_device) + new_data = quant_func( + blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original + ) + return new_data + + def ggml_quant( - data, - ggml_type, - scale=None, - zp=None, - wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - device="cuda", - original=False, + data, + ggml_type, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + device="cuda", + original=False, ): block_size, type_size = GGML_QUANT_SIZES[ggml_type] - data = data.to(torch.float32).to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - shape = data.shape n_blocks = data.nelement() // block_size split_num = 16 if max(data.shape) > 100_000 else 1 blocks = data.reshape((n_blocks, block_size)) + scale = scale.to(device).reshape(blocks.shape[0],-1) if scale is not None else 
scale + zp = zp.to(device).reshape(blocks.shape[0],-1) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device).reshape(blocks.shape[0],-1) if wmin is not None else wmin + d_scale = d_scale.to(device).reshape(blocks.shape[0],-1) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device).reshape(blocks.shape[0],-1) if d_wmin is not None else d_wmin + quant_func = GGML_QUANT_TYPE[ggml_type] results = [] - orig_device = blocks.device + chunk_size = (n_blocks + split_num - 1) // split_num for i in range(split_num): if split_num > 1: - start = (n_blocks * i) // split_num - end = (n_blocks * (i + 1)) // split_num - blocks = data.reshape((n_blocks, block_size))[start:end] - scale = scale[start:end] if scale is not None else scale - zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin[start:end] if wmin is not None else wmin - d_scale = d_scale[start:end] if d_scale is not None else d_scale - d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + start = chunk_size * i + end = chunk_size * (i + 1) + tmp_blocks = blocks[start:end] + tmp_scale = scale[start:end] if scale is not None else scale + tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + tmp_wmin = wmin[start:end] if wmin is not None else wmin + tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale + tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin shape = data.shape + new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, + imatrix, original) # imatrix = imatrix[start:end] if imatrix is not None else imatrix - try: - new_data = quant_func( - blocks, - scale, - zp=zp, - wmin=wmin, - d_scale=d_scale, - d_wmin=d_wmin, - imatrix=imatrix, - original=original, - ) - except torch.OutOfMemoryError: - device = "cpu" - blocks = blocks.to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory(device_list=orig_device) - new_data = quant_func( - blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original - ) + else: + new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) results.append(new_data) if split_num > 1: - clear_memory(device_list=orig_device) + clear_memory(device_list=device) if len(results) == 1: new_data = results[0] @@ -509,7 +516,7 @@ def q4_1_quant_block(blocks, scale=None, zp=None, **kwargs): id = get_reciprocal(d) n_blocks = blocks.shape[0] - qs = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) + qs = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 15).to(torch.uint8) block_size = GGML_QUANT_SIZES["q4_1"][0] qs = qs.reshape((n_blocks, 2, block_size // 2)).cpu().numpy() @@ -566,7 +573,7 @@ def q5_1_quant_block(blocks: np.array, scale=None, zp=None, **kwargs): block_size = GGML_QUANT_SIZES["q5_1"][0] id = get_reciprocal(d) - q = torch.trunc(blocks.sub(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() + q = torch.trunc(blocks.sub_(min).mul_(id).add_(0.5)).clamp_(0, 31).to(torch.uint8).cpu().numpy() qs = q.reshape((n_blocks, 2, block_size // 2)) qs = 
(qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) @@ -610,7 +617,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) elif original: scales, all_L, mins = make_qkx2_quants(blocks, bits=2, rmin=-0.5, rdelta=0.1, nstep=15, use_mad=True) max_scales = torch.max(scales, dim=-1, keepdim=True)[0] @@ -628,7 +635,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = (max_mins > 0).squeeze() output_scale[replace_ids] |= ( - torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 + torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 ) d_tmp = output_d * (output_scale & 0xF) @@ -658,7 +665,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i output_dmin = d_wmin.reshape(-1, 1).to(torch.float32) output_scale = scales.mul(get_reciprocal(output_d)).round_().clamp_(0, 15).to(torch.uint8) output_scale |= (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 15).to(torch.uint8) << 4 - all_L = blocks.add(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) + all_L = blocks.add_(mins.unsqueeze(-1)).div_(scales.unsqueeze(-1)).round_().clamp_(0, 3).to(torch.uint8) output_scale = output_scale.cpu().numpy() all_L = all_L.reshape(-1, 4, 32) @@ -692,7 +699,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, qscale = (inverse_dq_scale * scales).round_().clamp_(-32, 31) qdq_scale = dq_scale.to(torch.float32) * qscale reverse_qdq_scale = get_reciprocal(qdq_scale) - all_L = blocks.mul(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul_(reverse_qdq_scale.unsqueeze(-1)).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * inverse_dq_scale).round_().clamp_(-32, 31).add_(32) else: from auto_round.data_type.gguf import quant_tensor_gguf_sym_dq @@ -704,7 +711,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, blocks = blocks.reshape((nb, QK_K // 16, 16)) qdq_scale = scales.reshape((-1, QK_K // 16)).to(torch.float32) dq_scale = d_scale.reshape(-1, 1).to(torch.float32) - all_L = blocks.mul(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) + all_L = blocks.mul_(get_reciprocal(qdq_scale.unsqueeze(-1))).round_().clamp_(-4, 3).add_(4).to(torch.uint8) q_scales_offset = (qdq_scale * get_reciprocal(dq_scale)).round_().clamp_(-32, 31).add_(32) @@ -729,7 +736,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") def q4_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -742,7 +749,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 
63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -788,7 +795,7 @@ def q4_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 15) @@ -817,15 +824,15 @@ def q4_k_quant_block( @register_qtype("q5_k") def q5_k_quant_block( - blocks, - scale=None, - zp=None, - wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - original=False, - **kwargs, + blocks, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + original=False, + **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -838,7 +845,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -860,7 +867,7 @@ def q5_k_quant_block( dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks[replace_ids] + blocks_[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) .div_(d_tmp[replace_ids].unsqueeze(-1)) .round_() @@ -883,7 +890,7 @@ def q5_k_quant_block( q_scales = (scales * get_reciprocal(output_d)).round_().clamp_(0, 63).to(torch.uint8) q_mins = (mins * get_reciprocal(output_dmin)).round_().clamp_(0, 63).to(torch.uint8) all_L = ( - blocks.add(mins.unsqueeze(-1)) + blocks.add_(mins.unsqueeze(-1)) .mul_(get_reciprocal(scales.unsqueeze(-1))) .round_() .clamp_(0, 31) @@ -926,7 +933,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, rd = get_reciprocal(output_d) output_scale = scales.mul(rd).round_().clamp_(max=127).to(torch.int8) rs = get_reciprocal(scales).unsqueeze_(-1) # unsqueeze for broadcasting - all_L = blocks.mul(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L = blocks.mul_(rs).add_(32).round_().clamp_(0, 63).to(torch.uint8) elif original: scales, all_L = make_qx_quants(blocks, bits=6, rmse_type=1, qw=None) imax = abs(scales).argmax(dim=-1, keepdim=True) @@ -951,7 +958,7 @@ def q6_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, scales = scales.reshape((-1, QK_K // 16)) output_d = d_scale.reshape(-1, 1).to(torch.float32) output_scale = (scales * get_reciprocal(output_d)).round_().clamp_(max=127).to(torch.int8) - all_L = blocks.mul(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) + all_L = blocks.mul_(get_reciprocal(scales.unsqueeze(-1))).add_(32).round_().clamp_(0, 63).to(torch.uint8) tmp_L = all_L.reshape(nb, 4, 64) & 0xF output_ql = (tmp_L[:, ::2] | (tmp_L[:, 1::2] << 4)).reshape(nb, QK_K // 2).cpu().numpy().astype(np.uint8) From 55efbbca4ecba9a97274881780846436f744b417 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:25:42 +0000 Subject: [PATCH 44/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py 
| 57 +++++++++++---------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9158f3742..9d431898d 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -58,16 +58,16 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri def ggml_quant( - data, - ggml_type, - scale=None, - zp=None, - wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - device="cuda", - original=False, + data, + ggml_type, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + device="cuda", + original=False, ): block_size, type_size = GGML_QUANT_SIZES[ggml_type] data = data.to(torch.float32).to(device) @@ -75,11 +75,11 @@ def ggml_quant( n_blocks = data.nelement() // block_size split_num = 16 if max(data.shape) > 100_000 else 1 blocks = data.reshape((n_blocks, block_size)) - scale = scale.to(device).reshape(blocks.shape[0],-1) if scale is not None else scale - zp = zp.to(device).reshape(blocks.shape[0],-1) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device).reshape(blocks.shape[0],-1) if wmin is not None else wmin - d_scale = d_scale.to(device).reshape(blocks.shape[0],-1) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device).reshape(blocks.shape[0],-1) if d_wmin is not None else d_wmin + scale = scale.to(device).reshape(blocks.shape[0], -1) if scale is not None else scale + zp = zp.to(device).reshape(blocks.shape[0], -1) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(device).reshape(blocks.shape[0], -1) if wmin is not None else wmin + d_scale = d_scale.to(device).reshape(blocks.shape[0], -1) if d_scale is not None else d_scale + d_wmin = d_wmin.to(device).reshape(blocks.shape[0], -1) if d_wmin is not None else d_wmin quant_func = GGML_QUANT_TYPE[ggml_type] results = [] @@ -95,8 +95,9 @@ def ggml_quant( tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin shape = data.shape - new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, - imatrix, original) + new_data = ggml_quant_core( + quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, imatrix, original + ) # imatrix = imatrix[start:end] if imatrix is not None else imatrix else: new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) @@ -635,7 +636,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i replace_ids = (max_mins > 0).squeeze() output_scale[replace_ids] |= ( - torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 + torch.round(id_mins[replace_ids] * mins[replace_ids]).clip(0, 15).to(torch.uint8) << 4 ) d_tmp = output_d * (output_scale & 0xF) @@ -736,7 +737,7 @@ def q3_k_quant_block(blocks: np.array, scale=None, d_scale=None, original=False, @register_qtype("q4_k") def q4_k_quant_block( - blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs + blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, imatrix=None, original=False, split_num=1, **kwargs ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) @@ -824,15 +825,15 @@ def q4_k_quant_block( @register_qtype("q5_k") def q5_k_quant_block( - blocks, - scale=None, - zp=None, - 
wmin=None, - d_scale=None, - d_wmin=None, - imatrix=None, - original=False, - **kwargs, + blocks, + scale=None, + zp=None, + wmin=None, + d_scale=None, + d_wmin=None, + imatrix=None, + original=False, + **kwargs, ): nb = blocks.shape[0] blocks = blocks.reshape((nb, QK_K // 32, 32)) From 9fa4cd9920ccf57df740b4221ec3f574b4f337a4 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 21:44:59 +0800 Subject: [PATCH 45/57] try to fix ut failure --- auto_round/compressors/base.py | 4 +- auto_round/export/export_to_gguf/packing.py | 54 ++++++++++----------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2ccb22486..06b861ccf 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -713,8 +713,8 @@ def _check_compatibility(self) -> None: raise ValueError("Gguf format is not compatible with other formats, please choose only one of them") if has_gguf and self.iters != 0 and self.bits != 3 and not self.enable_alg_ext: logger.warning( - "`iters=0` is recommended when exporting to GGUF format except for bits 3," - " or add `enable_alg_ext` for better accuracy with much more tuning cost" + "`iters=0` is recommended when exporting to current GGUF format" + " or add `enable_alg_ext` for better accuracy with much more tuning cost." " Please refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" " to check the acc." ) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 9158f3742..5ca3c8d06 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -42,15 +42,15 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri original=original, ) except torch.OutOfMemoryError: - device = "cpu" - blocks = blocks.to(device) - scale = scale.to(device) if scale is not None else scale - zp = zp.to(device) if zp is not None and isinstance(zp, torch.Tensor) else zp - wmin = wmin.to(device) if wmin is not None else wmin - d_scale = d_scale.to(device) if d_scale is not None else d_scale - d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin - imatrix = imatrix.to(device) if imatrix is not None else imatrix - clear_memory(device_list=orig_device) + cpu_device = "cpu" + blocks = blocks.to(cpu_device) + scale = scale.to(cpu_device) if scale is not None else scale + zp = zp.to(cpu_device) if zp is not None and isinstance(zp, torch.Tensor) else zp + wmin = wmin.to(cpu_device) if wmin is not None else wmin + d_scale = d_scale.to(cpu_device) if d_scale is not None else d_scale + d_wmin = d_wmin.to(cpu_device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(cpu_device) if imatrix is not None else imatrix + clear_memory(device_list=device) new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) @@ -84,31 +84,31 @@ def ggml_quant( quant_func = GGML_QUANT_TYPE[ggml_type] results = [] chunk_size = (n_blocks + split_num - 1) // split_num - for i in range(split_num): - if split_num > 1: - start = chunk_size * i - end = chunk_size * (i + 1) - tmp_blocks = blocks[start:end] - tmp_scale = scale[start:end] if scale is not None else scale - tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp - tmp_wmin = wmin[start:end] if wmin is not None else wmin - tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale - tmp_d_wmin = 
d_wmin[start:end] if d_wmin is not None else d_wmin - shape = data.shape - new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, - imatrix, original) - # imatrix = imatrix[start:end] if imatrix is not None else imatrix + if split_num > 1: + for i in range(split_num): + start = chunk_size * i + end = chunk_size * (i + 1) + tmp_blocks = blocks[start:end] + tmp_scale = scale[start:end] if scale is not None else scale + tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + tmp_wmin = wmin[start:end] if wmin is not None else wmin + tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale + tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, + imatrix, original) + results.append(new_data) + if split_num > 1: + clear_memory(device_list=device) else: new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) results.append(new_data) - if split_num > 1: - clear_memory(device_list=device) + if len(results) == 1: new_data = results[0] else: new_data = np.concatenate(results, axis=0) - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness new_data = new_data.reshape(*shape[:-1], -1) return new_data @@ -867,7 +867,7 @@ def q5_k_quant_block( dm_tmp = output_dmin * q_mins replace_ids = d_tmp != 0 all_L[replace_ids] = ( - blocks_[replace_ids] + blocks[replace_ids] .add_(dm_tmp[replace_ids].unsqueeze(-1)) .div_(d_tmp[replace_ids].unsqueeze(-1)) .round_() From 70a3fdb8927e86f6296579130dc4bf51ce4d9dff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:52:15 +0000 Subject: [PATCH 46/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/packing.py | 30 ++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 93a87fb04..729174a1e 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -86,29 +86,29 @@ def ggml_quant( chunk_size = (n_blocks + split_num - 1) // split_num if split_num > 1: for i in range(split_num): - start = chunk_size * i - end = chunk_size * (i + 1) - tmp_blocks = blocks[start:end] - tmp_scale = scale[start:end] if scale is not None else scale - tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp - tmp_wmin = wmin[start:end] if wmin is not None else wmin - tmp_d_scale = d_scale[start:end] if d_scale is not None else d_scale - tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin - new_data = ggml_quant_core(quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, - imatrix, original) - results.append(new_data) - if split_num > 1: - clear_memory(device_list=device) + start = chunk_size * i + end = chunk_size * (i + 1) + tmp_blocks = blocks[start:end] + tmp_scale = scale[start:end] if scale is not None else scale + tmp_zp = zp[start:end] if zp is not None and isinstance(zp, torch.Tensor) else zp + tmp_wmin = wmin[start:end] if wmin is not None else wmin + tmp_d_scale = d_scale[start:end] if d_scale is not None else 
d_scale + tmp_d_wmin = d_wmin[start:end] if d_wmin is not None else d_wmin + new_data = ggml_quant_core( + quant_func, tmp_blocks, tmp_scale, tmp_zp, tmp_wmin, tmp_d_scale, tmp_d_wmin, imatrix, original + ) + results.append(new_data) + if split_num > 1: + clear_memory(device_list=device) else: new_data = ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatrix, original) results.append(new_data) - if len(results) == 1: new_data = results[0] else: new_data = np.concatenate(results, axis=0) - new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness + new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness new_data = new_data.reshape(*shape[:-1], -1) return new_data From b035b4f7ecbc15f343290365805e9ac07900f25e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 20 Nov 2025 22:07:17 +0800 Subject: [PATCH 47/57] try to fix ut failure --- auto_round/export/export_to_gguf/packing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 729174a1e..aa9e6ba27 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -42,6 +42,7 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri original=original, ) except torch.OutOfMemoryError: + orig_device = blocks.device cpu_device = "cpu" blocks = blocks.to(cpu_device) scale = scale.to(cpu_device) if scale is not None else scale @@ -50,7 +51,7 @@ def ggml_quant_core(quant_func, blocks, scale, zp, wmin, d_scale, d_wmin, imatri d_scale = d_scale.to(cpu_device) if d_scale is not None else d_scale d_wmin = d_wmin.to(cpu_device) if d_wmin is not None else d_wmin imatrix = imatrix.to(cpu_device) if imatrix is not None else imatrix - clear_memory(device_list=device) + clear_memory(device_list=orig_device) new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) From a7cd959f7c3aa5f5c7e77dc88efef00d88c1af02 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 11:54:43 +0800 Subject: [PATCH 48/57] update --- auto_round/compressors/base.py | 24 ++++++++++------ auto_round/export/export_to_gguf/export.py | 32 +++++++++++++++++++-- auto_round/export/export_to_gguf/packing.py | 1 - 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 06b861ccf..564f219f6 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1229,7 +1229,7 @@ def get_imatrix_hook(module, input, output): for hook in hooks: hook.remove() - def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: """Quantizes a layer using RTN (Round-To-Nearest) if available. 
This function attempts to quantize a layer by switching its data type to a @@ -1252,14 +1252,14 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: if is_fp8_linear(m): m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) set_module(self.model, name, m) - + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device # Step 1: Try quantization on GPU first, fall back to CPU if OOM if self.immediate_packing and self.iters == 0 and "gguf" in self.formats[0] and not self.disable_opt_rtn: + m = m.to(tuning_device) m.scale = None m.zp = None else: try: - tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device m = m.to(tuning_device) m = WrapperLinear( m, @@ -1271,7 +1271,6 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: disable_opt_rtn=self.disable_opt_rtn, ) m = m.unwrapper({}) - m.to("cpu") except torch.OutOfMemoryError: cuda_error_msg = traceback.format_exc() m = m.orig_layer if hasattr(m, "orig_layer") else m @@ -1291,11 +1290,14 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: raise # Step 2: Optional immediate packing/export - if self.immediate_packing: + if self.immediate_packing: # For gguf, packing conducts on block level self._immediate_pack(name) + if to_cpu: + m = m.to("cpu") else: + if to_cpu: + m = m.to("cpu") set_module(self.model, name, m) - if self.immediate_saving: all_to_quantized_module_names = [n for n, m in self.model.named_modules() if check_to_quantized(m)] last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) @@ -1303,6 +1305,8 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None) -> None: immediate_saving(self, m, name, last_module) def _immediate_pack(self, name: str): + if not self.immediate_packing: + return m = get_module(self.model, name) if not check_to_quantized(m): return @@ -1363,7 +1367,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - # self._quantize_embedding_layer() + self._quantize_embedding_layer() # levea to gguf itself to handle self.model.to("cpu") # Release memory @@ -1515,14 +1519,16 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) set_amax_for_all_moe_layers(block, attr_name="act_max") # Normalize imatrix and quantize layers if self.low_gpu_mem_usage: + block.to("cpu") clear_memory(device_list=self.device_list) + for _, m in block.named_modules(): # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 if hasattr(m, "imatrix"): m.imatrix /= m.imatrix_cnt if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.tmp_name) + self._quantize_layer_via_rtn(m.tmp_name,to_cpu=False) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) @@ -1641,7 +1647,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: else: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding = self._quantize_embedding_layer() + is_quantized_embedding,_ = self._quantize_embedding_layer() clear_memory(device_list=self.device_list) all_q_inputs = None if 
is_quantized_embedding: diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 140776087..48b103a4e 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -159,13 +159,16 @@ def pack_gguf_layer( model_type=convert_hf_to_gguf.ModelType.MMPROJ, ) ) + if not hasattr(model, "last_layer_name_to_block_name"): block_name_to_last_layer_name = {} block_names = get_block_names(model, quant_vision=True) block_names_flatten = flatten_list(block_names) + all_qlayer_name = [] for n, m in model.named_modules(): if not check_to_quantized(m): continue + all_qlayer_name.append(n) for block_name in block_names_flatten: block_name_split = block_name.split(".") name_split = n.split(".") @@ -177,13 +180,23 @@ def pack_gguf_layer( block_name_to_last_layer_name[block_name] = n last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} model.last_layer_name_to_block_name = last_layer_name_to_block_name + names_in_blocks=[] + for block_name in block_names_flatten: + block = get_module(model, block_name) + for n,m in block.named_modules(): + if check_to_quantized(m): + names_in_blocks.append(m.tmp_name) + names_outside_blocks = list(set(layer_config.keys()) - set(names_in_blocks)) + model.names_outside_blocks = names_outside_blocks + if name in model.last_layer_name_to_block_name: # Packing block + block = get_module(model, model.last_layer_name_to_block_name[name]) for gguf_model in gguf_model_instance_global: gguf_model.current_packing_block = model.last_layer_name_to_block_name[name] gguf_model.prepare_tensors() - block = get_module(model, model.last_layer_name_to_block_name[name]) + for n, m in block.named_modules(): if hasattr(m, "weight"): m.weight = None @@ -193,6 +206,21 @@ def pack_gguf_layer( if len(model.last_layer_name_to_block_name) == 0: for gguf_model in gguf_model_instance_global: gguf_model.current_packing_block = None + if name in model.names_outside_blocks: + # Packing block + for gguf_model in gguf_model_instance_global: + gguf_model.current_packing_block =name + gguf_model.prepare_tensors() + + layer = get_module(model, name) + if hasattr(layer, "weight"): + layer.weight = None + if hasattr(layer, "bias"): + layer.bias = None + model.names_outside_blocks.remove(name) + if len(model.names_outside_blocks) == 0: + for gguf_model in gguf_model_instance_global: + gguf_model.current_packing_block = None @torch.inference_mode() @@ -219,4 +247,4 @@ def save_quantized_as_gguf(output_dir, backend="gguf:q4_0", layer_config=None, v logger.info(f"Model successfully exported to {gguf_model.fname_out}, running time={rt}") del gguf_model_instance_global - return model + return model \ No newline at end of file diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index aa9e6ba27..9a48cfa0d 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -81,7 +81,6 @@ def ggml_quant( wmin = wmin.to(device).reshape(blocks.shape[0], -1) if wmin is not None else wmin d_scale = d_scale.to(device).reshape(blocks.shape[0], -1) if d_scale is not None else d_scale d_wmin = d_wmin.to(device).reshape(blocks.shape[0], -1) if d_wmin is not None else d_wmin - quant_func = GGML_QUANT_TYPE[ggml_type] results = [] chunk_size = (n_blocks + split_num - 1) // split_num From ae99930ccf1d98b5beb86acc573e0658ca076b22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Nov 2025 03:55:29 +0000 Subject: [PATCH 49/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 8 ++++---- auto_round/export/export_to_gguf/export.py | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 564f219f6..a841e7ea2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1290,7 +1290,7 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T raise # Step 2: Optional immediate packing/export - if self.immediate_packing: # For gguf, packing conducts on block level + if self.immediate_packing: # For gguf, packing conducts on block level self._immediate_pack(name) if to_cpu: m = m.to("cpu") @@ -1367,7 +1367,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - self._quantize_embedding_layer() # levea to gguf itself to handle + self._quantize_embedding_layer() # levea to gguf itself to handle self.model.to("cpu") # Release memory @@ -1528,7 +1528,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) if hasattr(m, "imatrix"): m.imatrix /= m.imatrix_cnt if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.tmp_name,to_cpu=False) + self._quantize_layer_via_rtn(m.tmp_name, to_cpu=False) all_to_quantized_module_names.remove(m.tmp_name) if not self.immediate_saving: mv_module_from_gpu(block) @@ -1647,7 +1647,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: else: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding,_ = self._quantize_embedding_layer() + is_quantized_embedding, _ = self._quantize_embedding_layer() clear_memory(device_list=self.device_list) all_q_inputs = None if is_quantized_embedding: diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 48b103a4e..0a5bfc461 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -180,10 +180,10 @@ def pack_gguf_layer( block_name_to_last_layer_name[block_name] = n last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} model.last_layer_name_to_block_name = last_layer_name_to_block_name - names_in_blocks=[] + names_in_blocks = [] for block_name in block_names_flatten: block = get_module(model, block_name) - for n,m in block.named_modules(): + for n, m in block.named_modules(): if check_to_quantized(m): names_in_blocks.append(m.tmp_name) names_outside_blocks = list(set(layer_config.keys()) - set(names_in_blocks)) @@ -196,7 +196,6 @@ def pack_gguf_layer( gguf_model.current_packing_block = model.last_layer_name_to_block_name[name] gguf_model.prepare_tensors() - for n, m in block.named_modules(): if hasattr(m, "weight"): m.weight = None @@ -209,7 +208,7 @@ def pack_gguf_layer( if name in model.names_outside_blocks: # Packing block for gguf_model in gguf_model_instance_global: - gguf_model.current_packing_block =name + gguf_model.current_packing_block = name gguf_model.prepare_tensors() layer = get_module(model, name) @@ -247,4 +246,4 @@ def 
save_quantized_as_gguf(output_dir, backend="gguf:q4_0", layer_config=None, v logger.info(f"Model successfully exported to {gguf_model.fname_out}, running time={rt}") del gguf_model_instance_global - return model \ No newline at end of file + return model From 79b249029966a8c4ae4af74a05435fb8c1251369 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 12:04:47 +0800 Subject: [PATCH 50/57] fix --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 564f219f6..2ccb34427 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1367,7 +1367,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) - self._quantize_embedding_layer() # levea to gguf itself to handle + # self._quantize_embedding_layer() # leave to gguf itself to handle self.model.to("cpu") # Release memory From 29b51885bf815d343d92c2ef03694cde78e366aa Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 12:08:24 +0800 Subject: [PATCH 51/57] update Signed-off-by: Wenhua Cheng --- auto_round/compressors/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2ccb34427..54561e464 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1363,11 +1363,9 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: for module in tqdm(modules, desc="Update weight global scale for fuse module"): update_fused_layer_global_scales(module) - has_gguf_k = ( - any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None - ) - # self._quantize_embedding_layer() # leave to gguf itself to handle + if not (any("gguf" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None): + self._quantize_embedding_layer() # leave to gguf itself to handle self.model.to("cpu") # Release memory @@ -1375,6 +1373,10 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = False if not self.disable_opt_rtn: + has_gguf_k = ( + any("gguf" in fmt and "k" in fmt for fmt in + getattr(self, "formats", [])) or self.super_bits is not None + ) if has_gguf_k: enable_imatrix = True elif self.data_type == "int" and self.sym: From 5e0bb6c590576ef9c78be8103b1ca8fddea59609 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Nov 2025 04:10:42 +0000 Subject: [PATCH 52/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 63c36b726..b0dbcc381 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1364,7 +1364,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: update_fused_layer_global_scales(module) if not (any("gguf" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None): - self._quantize_embedding_layer() # leave to gguf itself to handle + self._quantize_embedding_layer() # leave to gguf itself to handle self.model.to("cpu") # Release memory @@ -1373,8 +1373,7 @@ def _quantize_rtn(self) -> 
tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = False if not self.disable_opt_rtn: has_gguf_k = ( - any("gguf" in fmt and "k" in fmt for fmt in - getattr(self, "formats", [])) or self.super_bits is not None + any("gguf" in fmt and "k" in fmt for fmt in getattr(self, "formats", [])) or self.super_bits is not None ) if has_gguf_k: enable_imatrix = True From d6d2979589863b209df8ead4373c5fcc88ad3c15 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 21 Nov 2025 13:08:13 +0800 Subject: [PATCH 53/57] fix typo --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 63c36b726..951ee9e66 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1648,7 +1648,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: else: logger.info("start to cache block inputs") all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding, _ = self._quantize_embedding_layer() + is_quantized_embedding = self._quantize_embedding_layer() clear_memory(device_list=self.device_list) all_q_inputs = None if is_quantized_embedding: From 1c8fe02d3910a55f7e364e7fab8695f84affd006 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 23 Nov 2025 22:27:49 -0500 Subject: [PATCH 54/57] fix bug of gguf mllm Signed-off-by: n1ck-guo --- auto_round/export/export_to_gguf/convert.py | 16 +++---------- auto_round/utils/model.py | 25 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 572e4bb6f..5a00c49ed 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,7 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger +from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger, clean_module_parameter gguf = LazyImport("gguf") @@ -58,17 +58,6 @@ from torch import Tensor -def clean_module_parameter(submodule, parameter): - if submodule is None: - return - is_buffer = parameter in submodule._buffers - with torch.no_grad(): - if is_buffer: - submodule._buffers[parameter] = None - else: - submodule._parameters[parameter] = None - - def download_convert_file(redownload=False): CONVERT_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/convert_hf_to_gguf.py" FILE_NAME = "convert_hf_to_gguf.py" @@ -375,7 +364,8 @@ def prepare_tensors(cls): max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in chain(cls.generate_extra_tensors(), cls.get_tensors()): - if data_torch is None: + + if data_torch is None or data_torch.numel() == 0: continue # we don't need these if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index ff9b3b57c..34744274d 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -28,6 +28,31 @@ from auto_round.logger import logger from auto_round.schemes import QuantizationScheme +def clean_module_parameter(submodule: torch.nn.Module, param_name: str) -> None: + """This function is 
recommended to be used instead of module.weight = None. + For models like `tie_word_embeddings`, setting the embedding weight to None + causes `lm_head` to reallocate memory for its weight instead of treating it as a "bound shared weight," + it's now iterated over as an independent parameter, + resulting in an additional `lm_head` parameter in `named_parameters`. + + Args: + submodule (torch.nn.Module): submodule to clean + param_name (str): "weight" or "bias" + """ + if submodule is None: + return + is_buffer = param_name in submodule._buffers + with torch.no_grad(): + if is_buffer: + buf = submodule._buffers[param_name] + if buf is not None: + buf.data = torch.empty(0, dtype=buf.dtype, device=buf.device) + buf.requires_grad = False + else: + param = submodule._parameters[param_name] + if param is not None: + param.data = torch.empty(0, dtype=param.dtype, device=param.device) + param.requires_grad = False def convert_dtype_str2torch(str_dtype): """Converts a string dtype to its corresponding PyTorch dtype. From 0c254d7092d23779d1308f9b46bca98f2d06a958 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 03:27:29 +0000 Subject: [PATCH 55/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_gguf/convert.py | 12 ++++++++++-- auto_round/utils/model.py | 8 +++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 5a00c49ed..5f42535d3 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -50,7 +50,15 @@ from auto_round.export.export_to_gguf.config import ModelType from auto_round.export.export_to_gguf.packing import ggml_quant -from auto_round.utils import LazyImport, clear_memory, get_module, get_packing_device, is_fp8_model, logger, clean_module_parameter +from auto_round.utils import ( + LazyImport, + clean_module_parameter, + clear_memory, + get_module, + get_packing_device, + is_fp8_model, + logger, +) gguf = LazyImport("gguf") @@ -364,7 +372,7 @@ def prepare_tensors(cls): max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in chain(cls.generate_extra_tensors(), cls.get_tensors()): - + if data_torch is None or data_torch.numel() == 0: continue # we don't need these diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 34744274d..1c2fc7987 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -28,11 +28,12 @@ from auto_round.logger import logger from auto_round.schemes import QuantizationScheme + def clean_module_parameter(submodule: torch.nn.Module, param_name: str) -> None: """This function is recommended to be used instead of module.weight = None. - For models like `tie_word_embeddings`, setting the embedding weight to None - causes `lm_head` to reallocate memory for its weight instead of treating it as a "bound shared weight," - it's now iterated over as an independent parameter, + For models like `tie_word_embeddings`, setting the embedding weight to None + causes `lm_head` to reallocate memory for its weight instead of treating it as a "bound shared weight," + it's now iterated over as an independent parameter, resulting in an additional `lm_head` parameter in `named_parameters`. 
Args: @@ -54,6 +55,7 @@ def clean_module_parameter(submodule: torch.nn.Module, param_name: str) -> None: param.data = torch.empty(0, dtype=param.dtype, device=param.device) param.requires_grad = False + def convert_dtype_str2torch(str_dtype): """Converts a string dtype to its corresponding PyTorch dtype. From f85fe7e18a7bca7c48a90c4af70bd2f9bc303c0e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 24 Nov 2025 11:42:48 +0800 Subject: [PATCH 56/57] refine a little --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 452b5a21c..62c2f6a32 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -716,7 +716,7 @@ def _check_compatibility(self) -> None: "`iters=0` is recommended when exporting to current GGUF format" " or add `enable_alg_ext` for better accuracy with much more tuning cost." " Please refer to https://github.com/intel/auto-round/tree/main/docs/gguf_alg_ext_acc.md" - " to check the acc." + " for the accuracy results." ) if ( From 94085dc7b702403b9b25b455561a58e1044d3cfb Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 24 Nov 2025 13:27:36 +0800 Subject: [PATCH 57/57] refine a little --- auto_round/utils/device.py | 4 ++++ docs/step_by_step.md | 1 + 2 files changed, 5 insertions(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 1db1fce69..12c904b3e 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1383,8 +1383,12 @@ def update(self, device_list=None): continue if torch.cuda.is_available(): current_vram = torch.cuda.memory_reserved(device) / 1024**3 # GB + if device == "cuda": + device = "0" elif torch.xpu.is_available(): current_vram = torch.xpu.memory_reserved(device) / 1024**3 # GB + if device == "xpu": + device = "0" else: return diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 29924bc79..e8c1dca8d 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -559,6 +559,7 @@ autoround.save_quantized(format="auto_awq", output_dir="tmp_autoround") - **Reduced CPU Memory Usage :** + - Enable low_cpu_mem_usage (experimental): Only one export format is supported. The quantized model is saved immediately after each block is packed, reducing peak CPU memory usage. - Trigger immediate packing: Packing will be triggered immediately when using the command-line interface or the quantize_and_save API, as long as only one export format is specified.
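
The packing changes in this series converge on one control-flow pattern: flatten the weight into GGML blocks, split the block dimension into fixed-size chunks, run the packing kernel for each chunk on the accelerator, and retry a failing chunk on the host after releasing cached device memory. A minimal, self-contained sketch of that pattern follows; `pack_fn` is a hypothetical stand-in for a GGML block writer and the chunk count is arbitrary, so treat this as an illustration of the flow rather than the auto_round implementation itself.

import torch

def pack_in_chunks(blocks: torch.Tensor, pack_fn, split_num: int = 16, device: str = "cuda"):
    # Split along the block dimension so only one chunk is resident in VRAM at a time.
    n_blocks = blocks.shape[0]
    chunk_size = (n_blocks + split_num - 1) // split_num
    outputs = []
    for i in range(split_num):
        chunk = blocks[i * chunk_size:(i + 1) * chunk_size]
        if chunk.numel() == 0:
            continue
        try:
            outputs.append(pack_fn(chunk.to(device)))
        except torch.OutOfMemoryError:
            # Release cached allocations, then retry this chunk on the host.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            outputs.append(pack_fn(chunk.to("cpu")))
    # Concatenate on CPU so the packed result never has to live on the device.
    return torch.cat([out.to("cpu") for out in outputs], dim=0)

With sixteen chunks, only a sixteenth of a large embedding or lm_head tensor sits on the accelerator at any point, and the per-chunk CPU fallback keeps a single oversized layer from aborting the export; that is where the VRAM and host-memory savings described in the docs change above come from.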