From a13bdf05f11ea6d64d8ca82fdca14bdfb6837902 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 15:58:51 +0800 Subject: [PATCH 01/18] fix imatrix pad issue --- auto_round/data_type/int.py | 2 +- auto_round/data_type/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 699466dc8..996b2589d 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -71,7 +71,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 imatrix = 1.0 else: imatrix = imatrix.reshape(1, -1) - + imatrix= reshape_pad_tensor_by_group_size(imatrix, group_size, val=1e-5)[0].view(1,-1) imatrix = imatrix.expand(tensor.numel() // imatrix.numel(), -1) imatrix = imatrix.reshape(tensor.shape) diff --git a/auto_round/data_type/utils.py b/auto_round/data_type/utils.py index 1bb53a14b..517f8342f 100644 --- a/auto_round/data_type/utils.py +++ b/auto_round/data_type/utils.py @@ -23,7 +23,7 @@ from auto_round.utils import logger -def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int): +def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val:float=0.0): """Reshapes and pads the tensor to ensure that it can be quantized in groups of `group_size`. This function adjusts the @@ -55,7 +55,7 @@ def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int): return data, orig_shape, pad_len else: pad_len = (data.shape[1] + group_size - 1) // group_size * group_size - data.shape[1] - data_new = torch.nn.functional.pad(data, (0, pad_len)) + data_new = torch.nn.functional.pad(data, (val, pad_len)) data_new = data_new.reshape(-1, group_size) return data_new, orig_shape, pad_len From 4e201998387a382619141fe99abde8928863891b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 08:01:11 +0000 Subject: [PATCH 02/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/data_type/int.py | 2 +- auto_round/data_type/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 996b2589d..8fc6f79a0 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -71,7 +71,7 @@ def quant_tensor_rtn_sym(tensor, bits=4, group_size=-1, v=0, q_scale_thresh=1e-5 imatrix = 1.0 else: imatrix = imatrix.reshape(1, -1) - imatrix= reshape_pad_tensor_by_group_size(imatrix, group_size, val=1e-5)[0].view(1,-1) + imatrix = reshape_pad_tensor_by_group_size(imatrix, group_size, val=1e-5)[0].view(1, -1) imatrix = imatrix.expand(tensor.numel() // imatrix.numel(), -1) imatrix = imatrix.reshape(tensor.shape) diff --git a/auto_round/data_type/utils.py b/auto_round/data_type/utils.py index 517f8342f..4e458a669 100644 --- a/auto_round/data_type/utils.py +++ b/auto_round/data_type/utils.py @@ -23,7 +23,7 @@ from auto_round.utils import logger -def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val:float=0.0): +def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val: float = 0.0): """Reshapes and pads the tensor to ensure that it can be quantized in groups of `group_size`. 
This function adjusts the From 405bde72cc545fd6671e2fc7d08f9e70fd66377f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:05:26 +0800 Subject: [PATCH 03/18] update --- auto_round/__main__.py | 7 ++ auto_round/compressors/base.py | 6 +- auto_round/data_type/gguf.py | 124 +++++++++++--------- auto_round/data_type/utils.py | 2 +- auto_round/export/export_to_awq/utils.py | 6 - auto_round/export/export_to_gguf/packing.py | 4 +- auto_round/utils/device.py | 16 +++ 7 files changed, 99 insertions(+), 66 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 76a8f73d1..1ddd07660 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -172,6 +172,12 @@ def __init__(self, *args, **kwargs): type=float, help="Learning rate specifically for min-max tuning. " "If None, uses the same value as --lr. ", ) + tuning.add_argument( + "--momentum", + default=0, + type=float, + help="", + ) tuning.add_argument( "--gradient_accumulate_steps", default=1, @@ -591,6 +597,7 @@ def tune(args): extra_config=extra_config, layer_config=layer_config, model_dtype=args.model_dtype, + momentum=args.momentum, ) model_name = args.model.rstrip("/") diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2634769a1..2dc249517 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -152,6 +152,7 @@ def __init__( disable_opt_rtn: bool = False, seed: int = 42, low_cpu_mem_usage: bool = False, + momentum = 0.0, **kwargs, ): """Initialize AutoRound with quantization and tuning configuration. @@ -250,6 +251,7 @@ def __init__( self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES self.scale_dtype = convert_dtype_str2torch(scale_dtype) self.low_cpu_mem_usage = low_cpu_mem_usage + self.momentum = momentum if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") @@ -2625,10 +2627,10 @@ def _quantize_block( minmax_lr = torch.tensor(self.minmax_lr) if self.enable_minmax_tuning: optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0, momentum=self.momentum ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0,momentum=self.momentum) if len(round_params) + len(minmax_params) <= 0: dump_info = ( diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index f20c6c7a6..ab8cd01d2 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -17,6 +17,7 @@ from auto_round.data_type.register import register_dtype from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste +from auto_round.utils.device import clear_memory from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants from auto_round.logger import logger @@ -320,7 +321,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens @torch.no_grad() -def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None): +def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None,split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 group_size = 16 if bits == 2 else 32 @@ -348,6 +349,7 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri nstep=params["nstep"], use_mad=params["use_mad"], weights=quant_weights, + split_num=split_num ) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) @@ -446,10 +448,15 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 + if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head + split_num=16 + else: + split_num=1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) + tensor = tensor.to(torch.float32) if scale is None: - scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym(tensor, bits, scale_dtype, imatrix) + scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym(tensor, bits, scale_dtype, imatrix,split_num=split_num) inverse_scale = get_reciprocal(scale) int_w = torch.clamp(round_ste((tensor + wmin) * inverse_scale + v), 0, maxq) @@ -458,7 +465,62 @@ def quant_tensor_gguf_asym_dq( return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} -def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): +def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8): + dtype = torch.float32 + data = data.to(dtype) + maxq = 2**bits - 1 + minq = 0 + weights = 1.0 if weights is None else weights.to(dtype) + + results_scale = [] + results_rmin = [] + chunk_size = (data.shape[0]+split_num-1)//split_num + for start in range(0, data.shape[0], chunk_size): + end = min(start + chunk_size, data.shape[0]) + chunk = data[start:end] + chunk_weights = weights if isinstance(weights, float) else weights[start:end] + + rmin = torch.min(chunk, dim=1, keepdim=True)[0] + rmax = torch.max(chunk, dim=1, 
keepdim=True)[0] + sum_w = torch.sum(chunk_weights, dim=1, keepdim=True) + sum_x = torch.sum(chunk_weights * chunk, dim=1, keepdim=True) + scale = (rmax - rmin) / (maxq - minq) + iscale = get_reciprocal(scale) + quant_data = torch.clamp(torch.round(iscale * (chunk - rmin)), minq, maxq) + diff = scale * quant_data + rmin - chunk + best_mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True) + + for is_ in range(nstep): + factor = rrmin + rdelta * is_ + maxq - minq + scale_new = (rmax - rmin) / factor + iscale_new = get_reciprocal(scale_new) + quant_data_new = torch.clamp(torch.round(iscale_new * (chunk - rmin)), minq, maxq) + mul_weights_quant_data = chunk_weights * quant_data_new + sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) + sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) + sum_xl = torch.sum(mul_weights_quant_data * chunk, dim=-1, keepdim=True) + D = sum_w * sum_l2 - torch.pow(sum_l, 2) + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D + this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D + this_min[this_min > 0] = 0 + this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] + reverse_this_scale = get_reciprocal(this_scale) + quant_data = torch.clamp(torch.round(reverse_this_scale * (chunk - this_min)), minq, maxq) + diff = this_scale * quant_data + this_min - chunk + mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=-1, keepdim=True) + idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] + best_mad[idx_to_replace] = mad[idx_to_replace] + scale[idx_to_replace] = this_scale[idx_to_replace] + rmin[idx_to_replace] = this_min[idx_to_replace] + results_scale.append(scale.to(torch.float32)) + results_rmin.append(-rmin.to(torch.float32)) + if split_num>1: + clear_memory(device_list=[data.device]) + + return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) + + +def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None,split_num=1): """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. 
Args: @@ -473,57 +535,9 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u Returns: Tuple: (Optimal scale tensor, optimal minimum value tensor) """ - dtype = torch.float32 - data = data.to(dtype) - maxq = 2**bits - 1 - minq = 0 - weights = 1.0 if weights is None else weights.to(dtype) - - rmin = torch.min(data, dim=1, keepdim=True)[0] - rmax = torch.max(data, dim=1, keepdim=True)[0] - - sum_w = torch.sum(weights, dim=1, keepdim=True) - sum_x = torch.sum(weights * data, dim=1, keepdim=True) - - # scale = 1 / ((maxq - minq) / (rmax - rmin + 1e-8)) - scale = (rmax - rmin) / (maxq - minq) - iscale = get_reciprocal(scale) - # quant_data = torch.clamp(torch.round((maxq - minq) / (rmax - rmin + 1e-8) * (data - rmin)), minq, maxq) - quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq) - diff = scale * quant_data + rmin - data - - best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True) - - for is_ in range(nstep): - factor = rrmin + rdelta * is_ + maxq - minq - # iscale_new = factor / (rmax - rmin + 1e-8) - scale_new = (rmax - rmin) / factor - iscale_new = get_reciprocal(scale_new) - quant_data_new = torch.clamp(torch.round(iscale_new * (data - rmin)), minq, maxq) - - mul_weights_quant_data = weights * quant_data_new - sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) - sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) - sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True) - - D = sum_w * sum_l2 - torch.pow(sum_l, 2) - this_scale = (sum_w * sum_xl - sum_x * sum_l) / D - this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D - this_min[this_min > 0] = 0 - this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] - reverse_this_scale = get_reciprocal(this_scale) - - quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq) - diff = this_scale * quant_data + this_min - data - # diff = this_scale * quant_data_new + this_min - data - mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True) - - idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] - best_mad[idx_to_replace] = mad[idx_to_replace] - scale[idx_to_replace] = this_scale[idx_to_replace] - rmin[idx_to_replace] = this_min[idx_to_replace] - - return scale.to(torch.float32), -rmin.to(torch.float32) + return iterative_wls_quant_search_chunk(data=data, bits=bits, rrmin=rrmin, + rdelta=rdelta, nstep=nstep, use_mad=use_mad, + weights=weights, split_num=split_num) @torch.no_grad() @@ -550,7 +564,6 @@ def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): return scale -# @register_dtype("rtn_int_sym_dq") def quant_tensor_gguf_sym_dq( tensor, @@ -566,7 +579,6 @@ def quant_tensor_gguf_sym_dq( Args: tensor: Tensor containing the tensor to be quantized bits: Number of bits for quantization (e.g., 2, 3, 4, 8) - group_size: Number of elements to share scale for quantization v: Rounding value perturbation min_scale: Minimum scale coefficient for tensor max_scale: Maximum scale coefficient for tensor diff --git a/auto_round/data_type/utils.py b/auto_round/data_type/utils.py index 517f8342f..ee834db20 100644 --- a/auto_round/data_type/utils.py +++ b/auto_round/data_type/utils.py @@ -55,7 +55,7 @@ def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int, val:fl return data, orig_shape, pad_len else: pad_len = (data.shape[1] + group_size - 1) // group_size * group_size - 
data.shape[1] - data_new = torch.nn.functional.pad(data, (val, pad_len)) + data_new = torch.nn.functional.pad(data, (0, pad_len), value=val) data_new = data_new.reshape(-1, group_size) return data_new, orig_shape, pad_len diff --git a/auto_round/export/export_to_awq/utils.py b/auto_round/export/export_to_awq/utils.py index 0052ec9b1..4a6a48efe 100644 --- a/auto_round/export/export_to_awq/utils.py +++ b/auto_round/export/export_to_awq/utils.py @@ -317,9 +317,3 @@ def extra_repr(self) -> str: self.group_size, ) - -def clear_memory(weight=None): - if weight is not None: - del weight - gc.collect() - torch.cuda.empty_cache() diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 4c64a75d5..05c15ef0a 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -16,7 +16,7 @@ import torch from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, K_SCALE_SIZE, QK_K -from auto_round.utils import get_reciprocal +from auto_round.utils import get_reciprocal, clear_memory GGML_QUANT_TYPE = {} @@ -59,6 +59,7 @@ def ggml_quant( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) except Exception: + clear_memory() device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -66,6 +67,7 @@ def ggml_quant( wmin = wmin.to(device) if wmin is not None else wmin d_scale = d_scale.to(device) if d_scale is not None else d_scale d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin + imatrix = imatrix.to(device) if imatrix is not None else imatrix new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 2f63a3a2d..38de9a96f 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1308,3 +1308,19 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None return sorted(devices) raise TypeError(f"Unsupported device_map type: {type(device_map)}") + + +def gpu_synchronize(devices): + def _gpu_synchronize(device): + if torch.cuda.is_available(): + torch.cuda.synchronize(device) + elif torch.xpu.is_available(): + torch.xpu.synchronize(device) + + if isinstance(devices,(list,tuple)): + for device in devices: + _gpu_synchronize(device) + else: + _gpu_synchronize(devices) + + From 886a6c85f3592282f51ca0fd62fd0acf6772ab89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:06:41 +0000 Subject: [PATCH 04/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 9 ++-- auto_round/data_type/gguf.py | 51 ++++++++++++++------- auto_round/export/export_to_awq/utils.py | 1 - auto_round/export/export_to_gguf/packing.py | 2 +- auto_round/utils/device.py | 4 +- 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2dc249517..25f3825c1 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -152,7 +152,7 @@ def __init__( disable_opt_rtn: bool = False, seed: int = 42, low_cpu_mem_usage: bool = False, - momentum = 0.0, + momentum=0.0, **kwargs, ): """Initialize AutoRound with quantization and tuning configuration. 
@@ -2627,10 +2627,13 @@ def _quantize_block( minmax_lr = torch.tensor(self.minmax_lr) if self.enable_minmax_tuning: optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0, momentum=self.momentum + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], + lr=lr, + weight_decay=0, + momentum=self.momentum, ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0,momentum=self.momentum) + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0, momentum=self.momentum) if len(round_params) + len(minmax_params) <= 0: dump_info = ( diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index ab8cd01d2..ed05349b9 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -17,11 +17,11 @@ from auto_round.data_type.register import register_dtype from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste -from auto_round.utils.device import clear_memory from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants from auto_round.logger import logger from auto_round.utils import get_reciprocal +from auto_round.utils.device import clear_memory @register_dtype("int_sym_dq") @@ -321,7 +321,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens @torch.no_grad() -def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None,split_num=1): +def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatrix=None, split_num=1): super_bits = 4 if bits == 2 else 6 super_group_size = 16 if bits == 2 else 8 group_size = 16 if bits == 2 else 32 @@ -349,7 +349,7 @@ def search_gguf_scale_min_asym(tensor, bits=4, scale_dtype=torch.float16, imatri nstep=params["nstep"], use_mad=params["use_mad"], weights=quant_weights, - split_num=split_num + split_num=split_num, ) scale = scale.to(scale_dtype) scale = torch.where(torch.abs(scale) < 1e-30, torch.zeros_like(scale), scale) @@ -448,15 +448,17 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head - split_num=16 + if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head + split_num = 16 else: - split_num=1 + split_num = 1 tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) tensor = tensor.to(torch.float32) if scale is None: - scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym(tensor, bits, scale_dtype, imatrix,split_num=split_num) + scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym( + tensor, bits, scale_dtype, imatrix, split_num=split_num + ) inverse_scale = get_reciprocal(scale) int_w = torch.clamp(round_ste((tensor + wmin) * inverse_scale + v), 0, maxq) @@ -465,7 +467,9 @@ def quant_tensor_gguf_asym_dq( return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} -def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8): +def iterative_wls_quant_search_chunk( + data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 +): dtype = torch.float32 data = data.to(dtype) maxq = 2**bits - 1 @@ -474,7 +478,7 @@ def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep results_scale = 
[] results_rmin = [] - chunk_size = (data.shape[0]+split_num-1)//split_num + chunk_size = (data.shape[0] + split_num - 1) // split_num for start in range(0, data.shape[0], chunk_size): end = min(start + chunk_size, data.shape[0]) chunk = data[start:end] @@ -488,7 +492,9 @@ def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep iscale = get_reciprocal(scale) quant_data = torch.clamp(torch.round(iscale * (chunk - rmin)), minq, maxq) diff = scale * quant_data + rmin - chunk - best_mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True) + best_mad = torch.sum( + (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=1, keepdim=True + ) for is_ in range(nstep): factor = rrmin + rdelta * is_ + maxq - minq @@ -507,20 +513,26 @@ def iterative_wls_quant_search_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep reverse_this_scale = get_reciprocal(this_scale) quant_data = torch.clamp(torch.round(reverse_this_scale * (chunk - this_min)), minq, maxq) diff = this_scale * quant_data + this_min - chunk - mad = torch.sum((chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), dim=-1, keepdim=True) + mad = torch.sum( + (chunk_weights * torch.abs(diff)) if use_mad else chunk_weights * torch.pow(diff, 2), + dim=-1, + keepdim=True, + ) idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] best_mad[idx_to_replace] = mad[idx_to_replace] scale[idx_to_replace] = this_scale[idx_to_replace] rmin[idx_to_replace] = this_min[idx_to_replace] results_scale.append(scale.to(torch.float32)) results_rmin.append(-rmin.to(torch.float32)) - if split_num>1: + if split_num > 1: clear_memory(device_list=[data.device]) return torch.cat(results_scale, dim=0), torch.cat(results_rmin, dim=0) -def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None,split_num=1): +def iterative_wls_quant_search( + data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=1 +): """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. 
Args: @@ -535,9 +547,16 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u Returns: Tuple: (Optimal scale tensor, optimal minimum value tensor) """ - return iterative_wls_quant_search_chunk(data=data, bits=bits, rrmin=rrmin, - rdelta=rdelta, nstep=nstep, use_mad=use_mad, - weights=weights, split_num=split_num) + return iterative_wls_quant_search_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + split_num=split_num, + ) @torch.no_grad() diff --git a/auto_round/export/export_to_awq/utils.py b/auto_round/export/export_to_awq/utils.py index 4a6a48efe..871e4287a 100644 --- a/auto_round/export/export_to_awq/utils.py +++ b/auto_round/export/export_to_awq/utils.py @@ -316,4 +316,3 @@ def extra_repr(self) -> str: self.w_bit, self.group_size, ) - diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 05c15ef0a..f54f78a02 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -16,7 +16,7 @@ import torch from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, K_SCALE_SIZE, QK_K -from auto_round.utils import get_reciprocal, clear_memory +from auto_round.utils import clear_memory, get_reciprocal GGML_QUANT_TYPE = {} diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 38de9a96f..a281c4b46 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1317,10 +1317,8 @@ def _gpu_synchronize(device): elif torch.xpu.is_available(): torch.xpu.synchronize(device) - if isinstance(devices,(list,tuple)): + if isinstance(devices, (list, tuple)): for device in devices: _gpu_synchronize(device) else: _gpu_synchronize(devices) - - From e2d7e704e501d5e5e8b910551dc4841f1540d914 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:21:12 +0800 Subject: [PATCH 05/18] refine --- auto_round/compressors/base.py | 2 +- auto_round/data_type/gguf.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 2dc249517..68dfaec89 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2630,7 +2630,7 @@ def _quantize_block( [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0, momentum=self.momentum ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0,momentum=self.momentum) + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0, momentum=self.momentum) if len(round_params) + len(minmax_params) <= 0: dump_info = ( diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index ab8cd01d2..c77c3381f 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -448,10 +448,11 @@ def quant_tensor_gguf_asym_dq( orig_dtype = tensor.dtype maxq = 2**bits - 1 group_size = 16 if bits == 2 else 32 - if tensor.shape[-1] > 20000: # trick setting, for embedding and lm-head - split_num=16 - else: - split_num=1 + split_num = 1 + for dim in tensor.shape: + if dim > 100_000: + split_num = 16 + break tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size) tensor = tensor.to(torch.float32) @@ -535,6 +536,7 @@ def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, u Returns: Tuple: (Optimal scale tensor, optimal minimum value tensor) """ + # TODO this one should change to try catch later return 
iterative_wls_quant_search_chunk(data=data, bits=bits, rrmin=rrmin, rdelta=rdelta, nstep=nstep, use_mad=use_mad, weights=weights, split_num=split_num) From 21300752a15acc6cfc4a7403d5e2fe0fb18f474e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:24:15 +0800 Subject: [PATCH 06/18] clean --- auto_round/utils/device.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index a281c4b46..2f63a3a2d 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1308,17 +1308,3 @@ def parse_available_devices(device_map: Union[str, torch.device, int, dict, None return sorted(devices) raise TypeError(f"Unsupported device_map type: {type(device_map)}") - - -def gpu_synchronize(devices): - def _gpu_synchronize(device): - if torch.cuda.is_available(): - torch.cuda.synchronize(device) - elif torch.xpu.is_available(): - torch.xpu.synchronize(device) - - if isinstance(devices, (list, tuple)): - for device in devices: - _gpu_synchronize(device) - else: - _gpu_synchronize(devices) From 9ecf7e6273435c11a06a13fddacb15235dc1afbe Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:28:36 +0800 Subject: [PATCH 07/18] update --- auto_round/data_type/gguf.py | 99 ++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 913de1409..ecf60f280 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -460,7 +460,74 @@ def quant_tensor_gguf_asym_dq( qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} +def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): + """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. + Args: + data (torch.Tensor): Input tensor to quantize. + bits (int): Number of quantization bits. + rrmin (float): Initial range scaling factor. + rdelta (float): Step size for range scaling. + nstep (int): Number of search steps. + use_mad (bool): Whether to use mean absolute deviation instead of squared error. + weights (torch.Tensor): Weight matrix for each element. 
+ + Returns: + Tuple: (Optimal scale tensor, optimal minimum value tensor) + """ + dtype = torch.float32 + data = data.to(dtype) + maxq = 2**bits - 1 + minq = 0 + weights = 1.0 if weights is None else weights.to(dtype) + + rmin = torch.min(data, dim=1, keepdim=True)[0] + rmax = torch.max(data, dim=1, keepdim=True)[0] + + sum_w = torch.sum(weights, dim=1, keepdim=True) + sum_x = torch.sum(weights * data, dim=1, keepdim=True) + + # scale = 1 / ((maxq - minq) / (rmax - rmin + 1e-8)) + scale = (rmax - rmin) / (maxq - minq) + iscale = get_reciprocal(scale) + # quant_data = torch.clamp(torch.round((maxq - minq) / (rmax - rmin + 1e-8) * (data - rmin)), minq, maxq) + quant_data = torch.clamp(torch.round(iscale * (data - rmin)), minq, maxq) + diff = scale * quant_data + rmin - data + + best_mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=1, keepdim=True) + + for is_ in range(nstep): + factor = rrmin + rdelta * is_ + maxq - minq + # iscale_new = factor / (rmax - rmin + 1e-8) + scale_new = (rmax - rmin) / factor + iscale_new = get_reciprocal(scale_new) + quant_data_new = torch.clamp(torch.round(iscale_new * (data - rmin)), minq, maxq) + + mul_weights_quant_data = weights * quant_data_new + sum_l = torch.sum(mul_weights_quant_data, dim=-1, keepdim=True) + sum_l2 = torch.sum(mul_weights_quant_data * quant_data_new, dim=-1, keepdim=True) + sum_xl = torch.sum(mul_weights_quant_data * data, dim=-1, keepdim=True) + + D = sum_w * sum_l2 - torch.pow(sum_l, 2) + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D + this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D + this_min[this_min > 0] = 0 + this_scale[this_min > 0] = (sum_xl / sum_l2)[this_min > 0] + reverse_this_scale = get_reciprocal(this_scale) + + quant_data = torch.clamp(torch.round(reverse_this_scale * (data - this_min)), minq, maxq) + diff = this_scale * quant_data + this_min - data + # diff = this_scale * quant_data_new + this_min - data + mad = torch.sum((weights * torch.abs(diff)) if use_mad else weights * torch.pow(diff, 2), dim=-1, keepdim=True) + + idx_to_replace = torch.where((mad < best_mad) & (D > 0))[0] + best_mad[idx_to_replace] = mad[idx_to_replace] + scale[idx_to_replace] = this_scale[idx_to_replace] + rmin[idx_to_replace] = this_min[idx_to_replace] + + return scale.to(torch.float32), -rmin.to(torch.float32) + +# TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 ): @@ -543,16 +610,28 @@ def iterative_wls_quant_search( """ # TODO this one should change to try catch later - return iterative_wls_quant_search_chunk( - data=data, - bits=bits, - rrmin=rrmin, - rdelta=rdelta, - nstep=nstep, - use_mad=use_mad, - weights=weights, - split_num=split_num, - ) + if split_num>1: + return iterative_wls_quant_search_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + split_num=split_num, + ) + else: + return iterative_wls_quant_search_non_chunk( + data=data, + bits=bits, + rrmin=rrmin, + rdelta=rdelta, + nstep=nstep, + use_mad=use_mad, + weights=weights, + ) + @torch.no_grad() From ea310ec2eed98881b0c6c22186d742d2b30cc165 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:29:32 +0000 Subject: [PATCH 08/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
auto_round/data_type/gguf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index ecf60f280..577ccf34e 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -460,6 +460,7 @@ def quant_tensor_gguf_asym_dq( qdq_result = revert_tensor_by_pad(qdq_result, orig_shape=orig_shape, pad_len=pad_len) return qdq_result, {"scale": scale, "d_scale": d_scale}, {"wmin": wmin, "d_wmin": d_wmin} + def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None): """Adapted from Llamacpp. Performs iterative weighted least squares quantization search. @@ -527,6 +528,7 @@ def iterative_wls_quant_search_non_chunk(data, bits=4, rrmin=-1.0, rdelta=0.1, n return scale.to(torch.float32), -rmin.to(torch.float32) + # TODO consolidate iterative_wls_quant_search_chunk and non-chunk def iterative_wls_quant_search_chunk( data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, split_num=8 @@ -610,7 +612,7 @@ def iterative_wls_quant_search( """ # TODO this one should change to try catch later - if split_num>1: + if split_num > 1: return iterative_wls_quant_search_chunk( data=data, bits=bits, @@ -633,7 +635,6 @@ def iterative_wls_quant_search( ) - @torch.no_grad() def search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype): from auto_round.export.export_to_gguf.config import K_SCALE_SIZE, QK_K From 967af5503f8e58c23012f4577dd77d644d9f470c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:37:25 +0800 Subject: [PATCH 09/18] update --- auto_round/compressors/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 25f3825c1..4e954a20a 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -152,7 +152,6 @@ def __init__( disable_opt_rtn: bool = False, seed: int = 42, low_cpu_mem_usage: bool = False, - momentum=0.0, **kwargs, ): """Initialize AutoRound with quantization and tuning configuration. @@ -194,7 +193,7 @@ def __init__( super_group_size, super_bits, scale_dtype ("fp16" etc.), nblocks, to_quant_block_names, enable_norm_bias_tuning, enable_quanted_input, - disable_deterministic_algorithms, mllm, static_kv_dtype,enable_deterministic_algorithms + disable_deterministic_algorithms, mllm, static_kv_dtype,enable_deterministic_algorithms,momentum Raises: ValueError: If invalid device is provided or tokenizer is missing for non-str model with iters > 0. RuntimeError: If model parameters are on meta device. @@ -235,6 +234,7 @@ def __init__( enable_quanted_input: bool = kwargs.pop("enable_quanted_input", True) disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", True) enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False) + self.momentum = kwargs.pop("momentum", 0.0) static_kv_dtype = kwargs.pop("static_kv_dtype", None) model_dtype = kwargs.pop("model_dtype", None) device = kwargs.pop("device", None) @@ -251,7 +251,7 @@ def __init__( self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES self.scale_dtype = convert_dtype_str2torch(scale_dtype) self.low_cpu_mem_usage = low_cpu_mem_usage - self.momentum = momentum + if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") From 356ee30eda423be84efee5d06422242b8c913690 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:38:55 +0000 Subject: [PATCH 10/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 4e954a20a..c4ad37bd4 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -252,7 +252,6 @@ def __init__( self.scale_dtype = convert_dtype_str2torch(scale_dtype) self.low_cpu_mem_usage = low_cpu_mem_usage - if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: From 5f4d85cc0655f7789f5174473d57806dbc168801 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Thu, 13 Nov 2025 21:42:14 +0800 Subject: [PATCH 11/18] Update auto_round/__main__.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 1ddd07660..19fef935a 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -176,7 +176,7 @@ def __init__(self, *args, **kwargs): "--momentum", default=0, type=float, - help="", + help="Momentum factor for the optimizer. Default is 0 (no momentum).", ) tuning.add_argument( "--gradient_accumulate_steps", From a9fe2113b86ae32d42915b7e05424933d3fe9110 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 09:06:57 +0800 Subject: [PATCH 12/18] update --- auto_round/compressors/base.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 4e954a20a..2432551fc 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2625,15 +2625,25 @@ def _quantize_block( lr = torch.tensor(self.lr) minmax_lr = torch.tensor(self.minmax_lr) + is_adam = "adam" in self.__class__.__name__.lower() + + + extra_kwargs = {} if is_adam else {"momentum": self.momentum} + if self.enable_minmax_tuning: - optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], - lr=lr, - weight_decay=0, - momentum=self.momentum, - ) + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0, momentum=self.momentum) + params = round_params + + optimizer = self.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) if len(round_params) + len(minmax_params) <= 0: dump_info = ( From 63ae0c21c784fa13e26ba4c371f331387137b6e2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:08:06 +0000 Subject: [PATCH 13/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6578b0f86..23ffd0df5 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2626,7 +2626,6 @@ def _quantize_block( minmax_lr = torch.tensor(self.minmax_lr) is_adam = "adam" in self.__class__.__name__.lower() - 
extra_kwargs = {} if is_adam else {"momentum": self.momentum} if self.enable_minmax_tuning: From 36d41af93061eef978540638574597f4a56f7444 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 09:11:22 +0800 Subject: [PATCH 14/18] refine comments --- auto_round/compressors/base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c8d770dc2..00e7fded2 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1558,11 +1558,13 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: # It is best to modify the model structure in the quantize function and check the format, # because it may cause the gguf format to not be exported normally. self.model = _handle_moe_model(self.model, formats=formats) - # Assign temporary names after replacing modules - for n, m in self.model.named_modules(): # TODO check if could removed + + # Temporary names must be assigned after handle_moe_model; + # placing them earlier would cause them to be removed when the module is replaced. + for n, m in self.model.named_modules(): m.tmp_name = n - # TODO check scale_dtype + if not self.is_auto_scheme: enable_gguf_official_mixed = True else: From a3a19e2dd508384d8719449e3cd188a98ce5588b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:11:57 +0000 Subject: [PATCH 15/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 00e7fded2..daffe0acc 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1564,7 +1564,6 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: for n, m in self.model.named_modules(): m.tmp_name = n - if not self.is_auto_scheme: enable_gguf_official_mixed = True else: From c58403965bbcf53e510d3146dcf7e483217b5cab Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 10:19:59 +0800 Subject: [PATCH 16/18] update readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 29eeae56f..acec0b3e3 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ See our [paper](https://arxiv.org/pdf/2309.05516) for more details. For usage in ## 🆕 What's New -[2025/11] AutoRound now offers preliminary support for an **enhanced GGUF quantization algorithm** via `--enable_alg_ext`. For detailed accuracy benchmarks, please refer to the accompanying [documentation](./docs/gguf_alg_ext_acc.md). +[2025/11] AutoRound now offers preliminary support for an enhanced GGUF quantization algorithm via `--enable_alg_ext`. For detailed accuracy benchmarks, please refer to the [documentation](./docs/gguf_alg_ext_acc.md). [2025/10] AutoRound has been integrated into **SGLang**. You can now run models in the AutoRound format directly using the latest SGLang later than v0.5.4. @@ -192,7 +192,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") - **`layer_config` (dict)**: Configuration for weight quantization (default is `None`), mainly for mixed schemes. ##### Algorithm Settings -- **`enable_alg_ext` (bool)**: Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. 
+- **`enable_alg_ext` (bool)**: [Experimental Feature] Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`. - **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled). ##### Tuning Process Parameters @@ -208,6 +208,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round") ##### Device/Speed Configuration - **`enable_torch_compile` (bool)**: If no exception is raised, typically we recommend setting it to True for faster quantization with lower resource. - **`low_gpu_mem_usage` (bool)**: Whether to offload intermediate features to CPU at the cost of ~20% more tuning time (default is `False`). +- **`low_cpu_mem_usage` (bool)**: [Experimental Feature]Whether to enable saving immediately to save ram usage (default is `False`). - **`device_map` (str|dict|int)**: The device to be used for tuning, e.g., `auto`, "cpu"`, `"cuda"`, `"0,1,2"` (default is `'0'`). When using "auto", it will try to use all available GPUs. From 267ff642ecf52f70ebd77a5183fc8bd2969280ed Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 13:04:12 +0800 Subject: [PATCH 17/18] refine readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index acec0b3e3..598c3fd54 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,7 @@ refer to the documentation for accuracy [results](./docs/auto_scheme_acc.md) and for some accuracy results. [2025/07] AutoRound now offers experimental support for **GGUF** format, and recommends using optimized RTN mode (--iters 0) for - all bits other than 3 bits. **A more advanced algorithm** tailored for specific configurations may be available in - v0.8.1. + all bits other than 3 bits. [2025/05] AutoRound has been integrated into **Transformers** and **vLLM**. From 0bc902fd5b99815e01d274a75e7e08a906c3f10c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 14 Nov 2025 13:18:13 +0800 Subject: [PATCH 18/18] refine --- auto_round/export/export_to_gguf/packing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index f54f78a02..bc9189b7b 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -59,7 +59,6 @@ def ggml_quant( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original ) except Exception: - clear_memory() device = "cpu" blocks = blocks.to(device) scale = scale.to(device) if scale is not None else scale @@ -68,6 +67,7 @@ def ggml_quant( d_scale = d_scale.to(device) if d_scale is not None else d_scale d_wmin = d_wmin.to(device) if d_wmin is not None else d_wmin imatrix = imatrix.to(device) if imatrix is not None else imatrix + clear_memory() new_data = quant_func( blocks, scale, zp=zp, wmin=wmin, d_scale=d_scale, d_wmin=d_wmin, imatrix=imatrix, original=original )
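
The patches above thread a new `momentum` option through the stack: PATCH 03/18 adds the `--momentum` CLI flag, PATCH 09/18 has the compressor pick `momentum` up from `**kwargs` (default 0.0), and PATCH 12/18 forwards it to the optimizer for non-Adam optimizers only. The minimal sketch below (not part of the patch series) shows how the option could be exercised from the Python API; aside from `momentum` and the `quantize_and_save(output_dir=..., format=...)` call taken from the README snippet in these diffs, the constructor usage and the model id are assumptions about AutoRound's public entry point rather than something these patches define.

# Illustrative sketch only; assumes the public AutoRound entry point forwards
# extra keyword arguments to BaseCompressor.__init__, where PATCH 09/18 pops
# `momentum` from **kwargs and PATCH 12/18 passes it to non-Adam optimizers.
from auto_round import AutoRound

ar = AutoRound(
    "Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model id for illustration
    momentum=0.9,                  # new option; the default 0.0 means no momentum
)
ar.quantize_and_save(output_dir="./qmodel", format="auto_round")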