Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
dc592e9
implement model free
xin3he Apr 14, 2026
177bf48
polished implementation
xin3he Apr 15, 2026
97e0362
remove useless gpu_concurrency
xin3he Apr 15, 2026
ff47a97
添加预编译模式匹配器以提高量化过程中的性能和可扩展性
xin3he Apr 15, 2026
4d9ad0e
fix typo
xin3he Apr 15, 2026
58709e6
update document
xin3he Apr 16, 2026
d3951f2
remove useless code and update UT
xin3he Apr 16, 2026
16991ea
mend
xin3he Apr 16, 2026
83b9b4f
remove high_gpu_mem_usage since no performance benefit.
xin3he Apr 16, 2026
687260d
update regex
xin3he Apr 16, 2026
68d0cb7
fix bug and simplify UT
xin3he Apr 16, 2026
312f75d
fix bug
xin3he Apr 17, 2026
3ca4d3b
add WOQ limitation and support bits group_size setting
xin3he Apr 17, 2026
3f15e02
Merge branch 'main' into xinhe/4-14
xin3he Apr 17, 2026
47b3f35
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 17, 2026
76f9915
update doc
xin3he Apr 17, 2026
c588ad2
minor fix
xin3he Apr 17, 2026
0c14165
enable quant_nontext_module
xin3he Apr 20, 2026
405de53
Enhance model-free quantization support and improve documentation
xin3he Apr 23, 2026
6c5ce29
Merge remote-tracking branch 'origin/main' into xinhe/4-14
xin3he Apr 24, 2026
0697324
support loading pytorch_model.bin and ignore conv1d embed by creating…
xin3he Apr 24, 2026
f4fc5f4
add UT to cover conv1d detection
xin3he Apr 24, 2026
4f6f97e
support MXFP4/8 dequantization
xin3he Apr 24, 2026
ed46cd6
Merge branch 'main' into xinhe/4-14
xin3he Apr 24, 2026
7e3a3f8
fix pylint
xin3he Apr 24, 2026
958191a
Merge branch 'main' into xinhe/4-14
xin3he Apr 25, 2026
7440c32
add auto fallback and change class name
xin3he Apr 25, 2026
8b8d084
fix CI
xin3he Apr 25, 2026
eb5fdf4
update readme
xin3he Apr 26, 2026
98a5040
添加回退压缩器功能以支持量化和保存
xin3he Apr 26, 2026
46465c3
Merge branch 'main' into xinhe/4-14
xin3he Apr 26, 2026
7c76188
support diffusion model
xin3he Apr 27, 2026
a92acc2
fix bug
xin3he Apr 27, 2026
46ed32c
support layer_config={".ffn.experts.": {"scheme": "W2A16"}} usage
xin3he Apr 28, 2026
6f41cec
fix bug
xin3he Apr 28, 2026
9f81c67
update UT
xin3he Apr 28, 2026
16ead43
fix bug
xin3he Apr 29, 2026
48994a4
Merge remote-tracking branch 'origin/main' into xinhe/4-14
xin3he Apr 29, 2026
3d9812c
add model free for new arch
xin3he Apr 29, 2026
bd31861
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 84 additions & 4 deletions auto_round/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
import re
import sys

import torch

Comment thread
xin3he marked this conversation as resolved.
from auto_round.auto_scheme import AutoScheme
from auto_round.compressors import BaseCompressor
from auto_round.eval.eval_cli import EvalArgumentParser, eval, eval_task_by_task
from auto_round.eval.evaluation import run_model_evaluation
from auto_round.schemes import PRESET_SCHEMES
from auto_round.schemes import PRESET_SCHEMES, preset_name_to_scheme
from auto_round.utils import (
clear_memory,
get_device_and_parallelism,
Expand Down Expand Up @@ -183,7 +185,21 @@ def __init__(self, *args, **kwargs):
help="Disable trusting remote code when loading models. "
"Use for security if you don't trust the model source.",
)

basic.add_argument(
"--model_free",
action="store_true",
help="Force model-free quantization mode. "
"Downloads and quantizes safetensors files directly using RTN, "
"without loading the full model into memory. "
"Only supports auto_round output format.",
)
basic.add_argument(
"--disable_model_free",
action="store_true",
help="Disable the automatic model-free routing that activates when "
"--iters 0 --disable_opt_rtn is combined with a supported INT WOQ scheme. "
"Use this to force the regular AutoRound flow.",
)
tuning = self.add_argument_group("Tuning Arguments")
tuning.add_argument(
"--ignore_scale_zp_bits",
Expand Down Expand Up @@ -595,9 +611,73 @@ def tune(args):
if "marlin" in args.format and args.asym is True:
raise RuntimeError("marlin backend only supports sym quantization, please remove --asym")

device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)
# ======================= Model-Free Mode =======================
# The model-free path is now integrated into AutoRound itself. We only
# need to forward the relevant flags; AutoRound handles auto-routing
# (when iters=0 + disable_opt_rtn + supported scheme) and explicit
# ``--model_free``. Layer config / ignore-layers / output-dir handling
# for model-free still needs special treatment because the model is
# never loaded here.
explicit_model_free = bool(getattr(args, "model_free", False))
from auto_round.compressors.model_free import is_model_free_supported_scheme

auto_model_free = (
not explicit_model_free
and not getattr(args, "disable_model_free", False)
and getattr(args, "iters", None) == 0
and getattr(args, "disable_opt_rtn", None) is True
and is_model_free_supported_scheme(args.scheme, vars(args))
and (
str(getattr(args, "format", "auto_round") or "auto_round").lower().replace(" ", "").split(",")[0]
== "auto_round"
or format.startswith("auto_round")
)
)

if explicit_model_free or auto_model_free:
scheme = args.scheme.upper()
if scheme not in PRESET_SCHEMES:
raise ValueError(f"{scheme} is not supported. Only {list(PRESET_SCHEMES.keys())} are supported")
if not is_model_free_supported_scheme(scheme, vars(args)) and not explicit_model_free:
logger.info(
f"Auto-routing to model-free is skipped: scheme '{scheme}' is not in "
f"the model-free allowlist. Falling back to the regular AutoRound flow."
)
else:
layer_config = {}
if args.layer_config:
layer_config = parse_layer_config_arg(args.layer_config)

model_name = args.model.rstrip("/")
output_dir = args.output_dir
if output_dir == "./tmp_autoround" and model_name.split("/")[-1].strip(".") != "":
s = preset_name_to_scheme(scheme)
suffix = f"g{s.group_size}" if s.group_size > 0 else f"a{s.act_bits}"
output_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{s.bits}{suffix}")

from auto_round import AutoRound

ar_kwargs = dict(
scheme=scheme,
iters=0,
disable_opt_rtn=True,
model_free=True,
layer_config=layer_config,
ignore_layers=args.ignore_layers,
quant_lm_head=getattr(args, "quant_lm_head", False),
quant_nontext_module=getattr(args, "quant_nontext_module", False),
device_map=args.device_map,
)
if args.asym:
ar_kwargs["sym"] = False
if args.group_size:
ar_kwargs["group_size"] = args.group_size

import torch
ar = AutoRound(model_name, **ar_kwargs)
ar.quantize_and_save(output_dir=output_dir, format=args.format) # pylint: disable=E1101
return

device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)

if args.enable_torch_compile:
logger.info(
Expand Down
19 changes: 18 additions & 1 deletion auto_round/autoround.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@
MLLMCompressor,
)
from auto_round.compressors.diffusion.hybrid import HybridCompressor, is_hybrid_diffusion_model
from auto_round.compressors.model_free import ModelFreeCompressor
from auto_round.logger import deprecated, logger
from auto_round.schemes import QuantizationScheme
from auto_round.utils import is_diffusion_model, is_mllm_model
from auto_round.utils import is_diffusion_model, is_mllm_model, is_model_free_route

if TYPE_CHECKING:
from auto_round.auto_scheme.gen_auto_scheme import AutoScheme
Expand Down Expand Up @@ -169,6 +170,22 @@ def __new__(

return AutoRoundCompatible(**local_args, **kwargs)

# ---- Model-free fast-path detection --------------------------------
if is_model_free_route(model, scheme, iters, disable_opt_rtn, kwargs):
if not isinstance(model, str):
raise ValueError("model_free=True requires `model` to be a HuggingFace ID or local path string.")
if not bool(kwargs.get("model_free", False)):
logger.info(
"Auto-routing to model-free quantization "
"(iters=0, disable_opt_rtn=True, supported scheme). "
"Pass disable_model_free=True to use the regular flow."
)
if extra_config is not None:
local_args.update(extra_config.to_dict())
local_args["model_name_or_path"] = local_args.pop("model")
return ModelFreeCompressor(**local_args, **kwargs)
# --------------------------------------------------------------------

model_cls = []

has_multimodal_assets = kwargs.get("processor") is not None or kwargs.get("image_processor") is not None
Expand Down
1 change: 1 addition & 0 deletions auto_round/compressors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from auto_round.compressors.mllm.compressor import MLLMCompressor
from auto_round.compressors.diffusion.compressor import DiffusionCompressor
from auto_round.compressors.diffusion.hybrid import HybridCompressor
from auto_round.compressors.model_free import ModelFreeCompressor
from auto_round.compressors.config import (
DiffusionExtraConfig,
ExtraConfig,
Expand Down
Loading
Loading