support automatic mixed bits assignment #851
Merged

Changes from all commits (118 commits)
6ffcf60  try to enable auto_scheme API (wenhuach21)
5d80825  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
a4ef495  update a little (wenhuach21)
4173c3e  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
87e9454  update a little (wenhuach21)
f86eedb  Merge branch 'main' into auto_scheme (wenhuach21)
242d1ee  try to refine parse layer config code (wenhuach21)
4fc6b64  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
63de904  Merge branch 'main' into auto_scheme (wenhuach21)
bb4d4ca  Merge branch 'main' into auto_scheme (wenhuach21)
7f76db2  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
ae8837b  fix (wenhuach21)
44ca92d  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
531224d  fix (wenhuach21)
c9fa408  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
6453200  fix (wenhuach21)
5b2dd60  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
3811010  tmp_change (wenhuach21)
4de7b08  commit (wenhuach21)
a9f0e44  commit (wenhuach21)
59a9f5d  update a little (wenhuach21)
1b7e911  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
e068049  fix (wenhuach21)
1b84bf2  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
0357c0b  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
7c034bd  Merge branch 'main' into auto_scheme (wenhuach21)
602421c  merge autoscheme to scheme (wenhuach21)
091c5ad  refine layer_config code (wenhuach21)
90b6fa1  Merge branch 'main' into auto_scheme (wenhuach21)
f027801  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
c6b78c6  tiny change (wenhuach21)
1b9f24e  tiny fix (wenhuach21)
2c0075a  tmp change (wenhuach21)
97198f0  tmp change (wenhuach21)
27b4b4d  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
2d3095a  update (wenhuach21)
35a298b  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
4a594cd  fix (wenhuach21)
dcd08d6  fix uts, still one left (wenhuach21)
9172264  fix gguf issue (wenhuach21)
1d9e593  Merge branch 'main' into auto_scheme (wenhuach21)
f98092c  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
033d1f6  update a little (wenhuach21)
8ae1dfa  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
a3756ce  fix some issues (wenhuach21)
2f93471  fix some issues (wenhuach21)
e0c3d4b  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
0130932  Merge branch 'main' into auto_scheme (wenhuach21)
6e04d10  update (wenhuach21)
04c604c  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
3880038  Merge branch 'main' into auto_scheme (wenhuach21)
87d3694  fix one bug (wenhuach21)
fa85d42  Merge branch 'main' into auto_scheme (wenhuach21)
3855c8f  fix (wenhuach21)
d3e28c2  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
706df03  Merge branch 'main' into auto_scheme (wenhuach21)
2d557d0  set up the first version, there are many details to be handled (wenhuach21)
567ebb8  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
cedad47  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
0c3a0e2  fix one bug (wenhuach21)
cced6d8  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
58d5ae2  uncomment ut (wenhuach21)
e9bcd4a  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
ea489c3  rename functions (wenhuach21)
c763761  Merge branch 'main' into auto_scheme (wenhuach21)
f74fcb4  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
982202b  Merge branch 'main' into auto_scheme (wenhuach21)
9cfa4e5  update (wenhuach21)
a7efdf6  Merge branch 'main' into auto_scheme (wenhuach21)
ac4036e  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
8a8cb61  fix (wenhuach21)
0e2be6c  fix a bug (wenhuach21)
8d854db  update (wenhuach21)
d7908f4  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
21dd1c2  support multiple gpu via device_map (wenhuach21)
ab81181  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
48e4feb  update ut (wenhuach21)
da7eac1  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
2b9c5bb  support large models (wenhuach21)
f6e214d  Merge branch 'main' into auto_scheme (wenhuach21)
2b1bf59  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
fcfb9c6  support shared layers (wenhuach21)
a100823  Merge branch 'main' into auto_scheme (wenhuach21)
91ec73d  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
9a2738a  update a little (wenhuach21)
f95840d  Merge branch 'main' into auto_scheme (wenhuach21)
6132faa  fix gguf issue (wenhuach21)
c111608  support gguf (wenhuach21)
07c3eb4  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
c74df50  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
4f7cd55  Merge branch 'main' into auto_scheme (wenhuach21)
ae87b77  revert test (wenhuach21)
2682247  update (wenhuach21)
30fee22  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
248cd21  Merge branch 'main' into auto_scheme (wenhuach21)
fd18a9f  Merge branch 'main' into auto_scheme (wenhuach21)
8ce5b1e  fix merge issue (wenhuach21)
8e16325  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
decdcce  fix merge issue (wenhuach21)
978082c  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
f9e80ab  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
efc69de  update (wenhuach21)
4d1f8de  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
f1ed097  update (wenhuach21)
bae0354  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
e014f41  update (wenhuach21)
dc56dc6  Merge branch 'main' into auto_scheme (wenhuach21)
5e1c4e8  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
7715eab  support torch enable compile (wenhuach21)
6f61ee1  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
7bd6273  add so file and cpu ut (wenhuach21)
9c0eb06  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
815de02  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
2959aa9  correct model path (wenhuach21)
019991f  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
76f4f8b  update so (wenhuach21)
bdf5421  update readme (wenhuach21)
5348376  Merge branch 'auto_scheme' of https://github.com/intel/auto-round int… (wenhuach21)
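This PR adds an `AutoScheme` object that automatically assigns mixed bit-widths across layers to hit a target average bit-width. As a rough usage sketch — the field names `avg_bits`, `options`, and `ignore_scale_zp_bits` come from the diff below, while the `AutoRound` wiring, model name, and option strings are assumptions:

```python
# Hedged usage sketch. AutoScheme fields are taken from the diff in this PR;
# the AutoRound integration and the option names are assumptions.
from auto_round import AutoRound, AutoScheme

scheme = AutoScheme(
    avg_bits=3.0,                # target average bit-width across quantized layers
    options=("W2A16", "W4A16"),  # candidate per-layer schemes the search may mix
    ignore_scale_zp_bits=False,  # whether scale/zero-point overhead counts toward avg_bits
)
autoround = AutoRound(model="facebook/opt-125m", scheme=scheme)
autoround.quantize_and_save("./qmodel")
```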
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
New file, +42 lines (likely `auto_round/auto_scheme/__init__.py`, judging by the import `from auto_round.auto_scheme import AUTO_SCHEME_METHODS` in the next file). It defines the method registry and its registration decorator:

```python
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Registry mapping method names to mixed-precision scheme algorithms.
AUTO_SCHEME_METHODS = {}


def register_scheme_methods(names):
    """Decorator that registers a mixed-precision algorithm in the registry.

    Use it above the algorithm class or function to be registered.

    Args:
        names: A string, or a tuple/list of strings, naming the method(s).

    Returns:
        The registered object, unchanged.
    """

    def register(alg):
        if isinstance(names, (tuple, list)):
            for name in names:
                AUTO_SCHEME_METHODS[name] = alg
        else:
            AUTO_SCHEME_METHODS[names] = alg
        return alg

    return register


# Imported last so the default algorithm can register itself via the decorator above.
import auto_round.auto_scheme.default_alg
```
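This registry pairs with `GenScheme.get_layer_config` below, which looks up `AUTO_SCHEME_METHODS[method_name]` and calls the registered object directly. A hypothetical registration might look like this — the name `"naive"` and the toy logic are invented; only the decorator and the call signature used by `GenScheme` are taken from the diff:

```python
from auto_round.auto_scheme import AUTO_SCHEME_METHODS, register_scheme_methods


@register_scheme_methods(("naive", "naive-alias"))
def naive_method(auto_scheme, model, quant_layer_names, fixed_layer_scheme,
                 dataset, tokenizer, device_map=None, enable_torch_compile=False):
    """Toy method: give every layer without a fixed scheme the first candidate option."""
    layer_config = dict(fixed_layer_scheme)  # fixed layers keep their scheme
    for name in quant_layer_names:
        layer_config.setdefault(name, {"bits": 4, "sym": True})  # placeholder scheme
    return layer_config


assert AUTO_SCHEME_METHODS["naive"] is naive_method  # both aliases now resolve to it
```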
Binary file not shown (presumably the prebuilt `.so` file referenced by commits 7bd6273 "add so file and cpu ut" and 76f4f8b "update so").
New file, +149 lines (the file path is not preserved in this capture). It defines `GenScheme`, the driver that validates the requested average bit-width, dispatches to the selected algorithm, and applies GGUF fallbacks:

```python
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from dataclasses import asdict
from typing import Iterable, Union

import torch

from auto_round import AutoScheme
from auto_round.auto_scheme import AUTO_SCHEME_METHODS
from auto_round.auto_scheme.utils import compute_avg_bits_for_scheme
from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG
from auto_round.logger import logger
from auto_round.utils import _gguf_type_fallback, get_layer_features, get_module


class GenScheme:
    """Generate and validate quantization schemes for model layers."""

    def __init__(
        self,
        auto_scheme: AutoScheme,  # TODO: support shared layers
        model: torch.nn.Module,
        quant_layer_names: Iterable[str],
        fixed_layer_scheme: dict[str, dict],
        dataset: str = "pile-10k",  # TODO: use the auto-round dataset
        device_map: Union[str, torch.device, int, dict, None] = None,
        tokenizer=None,
        enable_torch_compile=False,
    ):
        self.auto_scheme = auto_scheme
        self.model = model
        self.tokenizer = tokenizer
        self.quant_layer_names = quant_layer_names
        self.fixed_layer_scheme = fixed_layer_scheme
        self.dataset = dataset
        self.device_map = device_map if self.auto_scheme.device_map is None else self.auto_scheme.device_map
        self.enable_torch_compile = enable_torch_compile
        self._check_configs()

    def _check_configs(self) -> None:
        """Validate the auto_scheme configuration and ensure the avg_bits target is valid."""
        if isinstance(self.model, torch.nn.Module) and self.tokenizer is None:
            raise ValueError("tokenizer must not be None if model is nn.Module")

        if not isinstance(self.dataset, str):
            raise TypeError(f"`dataset` must be a string, got {type(self.dataset).__name__}.")

        min_avg_bit, max_avg_bit = self.compute_avg_bit_range()
        target = self.auto_scheme.avg_bits

        logger.info("Average bits range: [%.3f, %.3f], target = %.3f", min_avg_bit, max_avg_bit, target)
        # Snap the target to a boundary when it is within tolerance of one.
        if abs(target - min_avg_bit) < 1e-3 or abs(target - max_avg_bit) < 1e-3:
            target = min_avg_bit if abs(target - min_avg_bit) < 1e-3 else max_avg_bit
            self.auto_scheme.avg_bits = target

        if not (min_avg_bit <= target <= max_avg_bit):
            raise ValueError(
                f"Target avg_bits={target:.3f} is outside the valid range "
                f"[{min_avg_bit:.3f}, {max_avg_bit:.3f}]."
            )

    def get_layer_config(self) -> dict[str, dict]:
        method_name = self.auto_scheme.method
        method_func = AUTO_SCHEME_METHODS[method_name]
        layer_config = method_func(
            self.auto_scheme,
            self.model,
            self.quant_layer_names,
            self.fixed_layer_scheme,
            self.dataset,
            self.tokenizer,
            device_map=self.device_map,
            enable_torch_compile=self.enable_torch_compile,
        )
        layer_config = self.fallback_gguf_layer_config(layer_config)
        return layer_config

    def fallback_gguf_layer_config(self, layer_config: dict[str, dict]) -> dict[str, dict]:
        """Apply fallback configurations for GGUF-quantized layers when the current
        layer configuration is incompatible with input-feature alignment.

        Args:
            layer_config (dict[str, dict]): Mapping from layer name to its quantization scheme.

        Returns:
            dict[str, dict]: Updated layer configuration with fallbacks applied where necessary.
        """
        for name, scheme in layer_config.items():  # TODO: add unit test (wenhua); the code is a little tricky
            if scheme.get("super_bits") is None:
                continue  # Skip non-GGUF k-quant layers

            layer = get_module(self.model, name)
            input_features, out_features = get_layer_features(layer)
            if input_features is None:
                continue
            if input_features % 256 == 0 or isinstance(layer, torch.nn.Embedding):
                continue

            # Determine the fallback quantization type
            if input_features % 256 != 0 and input_features % 32 != 0:
                new_type = "gguf:bf16"
            elif input_features % 256 != 0:
                bits = scheme["bits"]
                prefix_idx = 0 if scheme["sym"] else 1
                new_type = f"gguf:q{bits}_{prefix_idx}"
                if new_type not in GGUF_INNER_CONFIG:
                    new_type = f"gguf:q{bits}_{1 - prefix_idx}"
                if new_type not in GGUF_INNER_CONFIG:
                    current_type = f"gguf:q{bits}_k"
                    new_type = _gguf_type_fallback(current_type)

            # Apply the fallback configuration
            target_config = GGUF_INNER_CONFIG[new_type]
            for key in scheme.keys():
                if key in target_config:
                    scheme[key] = target_config[key]

            logger.warning(f"Fallback applied: {name} → {new_type}")

        return layer_config

    def compute_avg_bit_range(self) -> tuple[float, float]:
        """Compute the min and max average bit-widths among candidate quantization options."""
        avg_bits = [
            compute_avg_bits_for_scheme(
                self.model,
                self.quant_layer_names,
                self.fixed_layer_scheme,
                option,
                self.auto_scheme.ignore_scale_zp_bits,
            )[0]
            for option in self.auto_scheme.options
        ]
        return min(avg_bits), max(avg_bits)
```
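To make the alignment rule in `fallback_gguf_layer_config` concrete, here is a small worked trace — the feature sizes are invented, while the divisibility thresholds of 256 (k-quant block size) and 32 come from the code above:

```python
# Worked trace of the fallback rules (feature sizes invented for illustration).
# A GGUF k-quant scheme needs in_features divisible by 256; the plain
# q{bits}_0/q{bits}_1 types need divisibility by 32; otherwise gguf:bf16.
for in_features in (4096, 4000, 1000):
    if in_features % 256 == 0:
        decision = "keep k-quant"                 # 4096 = 16 * 256
    elif in_features % 32 == 0:
        decision = "fall back to q{bits}_0 or _1"  # 4000 = 125 * 32, but 4000 % 256 == 160
    else:
        decision = "fall back to gguf:bf16"        # 1000 % 32 == 8
    print(in_features, "->", decision)
```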