Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
dc592e9
implement model free
xin3he Apr 14, 2026
177bf48
polished implementation
xin3he Apr 15, 2026
97e0362
remove useless gpu_concurrency
xin3he Apr 15, 2026
ff47a97
添加预编译模式匹配器以提高量化过程中的性能和可扩展性
xin3he Apr 15, 2026
4d9ad0e
fix typo
xin3he Apr 15, 2026
58709e6
update document
xin3he Apr 16, 2026
d3951f2
remove useless code and update UT
xin3he Apr 16, 2026
16991ea
mend
xin3he Apr 16, 2026
83b9b4f
remove high_gpu_mem_usage since no performance benefit.
xin3he Apr 16, 2026
687260d
update regex
xin3he Apr 16, 2026
68d0cb7
fix bug and simplify UT
xin3he Apr 16, 2026
312f75d
fix bug
xin3he Apr 17, 2026
3ca4d3b
add WOQ limitation and support bits group_size setting
xin3he Apr 17, 2026
3f15e02
Merge branch 'main' into xinhe/4-14
xin3he Apr 17, 2026
47b3f35
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 17, 2026
76f9915
update doc
xin3he Apr 17, 2026
c588ad2
minor fix
xin3he Apr 17, 2026
0c14165
enable quant_nontext_module
xin3he Apr 20, 2026
405de53
Enhance model-free quantization support and improve documentation
xin3he Apr 23, 2026
6c5ce29
Merge remote-tracking branch 'origin/main' into xinhe/4-14
xin3he Apr 24, 2026
0697324
support loading pytorch_model.bin and ignore conv1d embed by creating…
xin3he Apr 24, 2026
f4fc5f4
add UT to cover conv1d detection
xin3he Apr 24, 2026
4f6f97e
support MXFP4/8 dequantization
xin3he Apr 24, 2026
ed46cd6
Merge branch 'main' into xinhe/4-14
xin3he Apr 24, 2026
7e3a3f8
fix pylint
xin3he Apr 24, 2026
958191a
Merge branch 'main' into xinhe/4-14
xin3he Apr 25, 2026
7440c32
add auto fallback and change class name
xin3he Apr 25, 2026
8b8d084
fix CI
xin3he Apr 25, 2026
eb5fdf4
update readme
xin3he Apr 26, 2026
98a5040
添加回退压缩器功能以支持量化和保存
xin3he Apr 26, 2026
46465c3
Merge branch 'main' into xinhe/4-14
xin3he Apr 26, 2026
7c76188
support diffusion model
xin3he Apr 27, 2026
a92acc2
fix bug
xin3he Apr 27, 2026
46ed32c
support layer_config={".ffn.experts.": {"scheme": "W2A16"}} usage
xin3he Apr 28, 2026
6f41cec
fix bug
xin3he Apr 28, 2026
9f81c67
update UT
xin3he Apr 28, 2026
16ead43
fix bug
xin3he Apr 29, 2026
48994a4
Merge remote-tracking branch 'origin/main' into xinhe/4-14
xin3he Apr 29, 2026
3d9812c
add model free for new arch
xin3he Apr 29, 2026
bd31861
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 84 additions & 4 deletions auto_round/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
import re
import sys

import torch

Comment thread
xin3he marked this conversation as resolved.
from auto_round.auto_scheme import AutoScheme
from auto_round.compressors import BaseCompressor
from auto_round.eval.eval_cli import EvalArgumentParser, eval, eval_task_by_task
from auto_round.eval.evaluation import run_model_evaluation
from auto_round.schemes import PRESET_SCHEMES
from auto_round.schemes import PRESET_SCHEMES, preset_name_to_scheme
from auto_round.utils import (
clear_memory,
get_device_and_parallelism,
Expand Down Expand Up @@ -183,7 +185,21 @@ def __init__(self, *args, **kwargs):
help="Disable trusting remote code when loading models. "
"Use for security if you don't trust the model source.",
)

basic.add_argument(
"--model_free",
action="store_true",
help="Force model-free quantization mode. "
"Downloads and quantizes safetensors files directly using RTN, "
"without loading the full model into memory. "
"Only supports auto_round output format.",
)
basic.add_argument(
"--disable_model_free",
action="store_true",
help="Disable the automatic model-free routing that activates when "
"--iters 0 --disable_opt_rtn is combined with a supported INT WOQ scheme. "
"Use this to force the regular AutoRound flow.",
)
tuning = self.add_argument_group("Tuning Arguments")
tuning.add_argument(
"--ignore_scale_zp_bits",
Expand Down Expand Up @@ -595,9 +611,73 @@ def tune(args):
if "marlin" in args.format and args.asym is True:
raise RuntimeError("marlin backend only supports sym quantization, please remove --asym")

device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)
# ======================= Model-Free Mode =======================
# The model-free path is now integrated into AutoRound itself. We only
# need to forward the relevant flags; AutoRound handles auto-routing
# (when iters=0 + disable_opt_rtn + supported scheme) and explicit
# ``--model_free``. Layer config / ignore-layers / output-dir handling
# for model-free still needs special treatment because the model is
# never loaded here.
explicit_model_free = bool(getattr(args, "model_free", False))
from auto_round.compressors.model_free import is_model_free_supported_scheme

auto_model_free = (
not explicit_model_free
and not getattr(args, "disable_model_free", False)
and getattr(args, "iters", None) == 0
and getattr(args, "disable_opt_rtn", None) is True
and is_model_free_supported_scheme(args.scheme, vars(args))
and (
str(getattr(args, "format", "auto_round") or "auto_round").lower().replace(" ", "").split(",")[0]
== "auto_round"
or format.startswith("auto_round")
)
)

if explicit_model_free or auto_model_free:
scheme = args.scheme.upper()
if scheme not in PRESET_SCHEMES:
raise ValueError(f"{scheme} is not supported. Only {list(PRESET_SCHEMES.keys())} are supported")
if not is_model_free_supported_scheme(scheme, vars(args)) and not explicit_model_free:
logger.info(
f"Auto-routing to model-free is skipped: scheme '{scheme}' is not in "
f"the model-free allowlist. Falling back to the regular AutoRound flow."
)
else:
layer_config = {}
if args.layer_config:
layer_config = parse_layer_config_arg(args.layer_config)

model_name = args.model.rstrip("/")
output_dir = args.output_dir
if output_dir == "./tmp_autoround" and model_name.split("/")[-1].strip(".") != "":
s = preset_name_to_scheme(scheme)
suffix = f"g{s.group_size}" if s.group_size > 0 else f"a{s.act_bits}"
output_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{s.bits}{suffix}")

from auto_round import AutoRound

ar_kwargs = dict(
scheme=scheme,
iters=0,
disable_opt_rtn=True,
model_free=True,
layer_config=layer_config,
ignore_layers=args.ignore_layers,
quant_lm_head=getattr(args, "quant_lm_head", False),
quant_nontext_module=getattr(args, "quant_nontext_module", False),
device_map=args.device_map,
)
if args.asym:
ar_kwargs["sym"] = False
if args.group_size:
ar_kwargs["group_size"] = args.group_size

import torch
ar = AutoRound(model_name, **ar_kwargs)
ar.quantize_and_save(output_dir=output_dir, format=args.format) # pylint: disable=E1101
return

device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)

if args.enable_torch_compile:
logger.info(
Expand Down
19 changes: 18 additions & 1 deletion auto_round/autoround.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@
MLLMCompressor,
)
from auto_round.compressors.diffusion.hybrid import HybridCompressor, is_hybrid_diffusion_model
from auto_round.compressors.model_free import ModelFreeCompressor
from auto_round.logger import deprecated, logger
from auto_round.schemes import QuantizationScheme
from auto_round.utils import is_diffusion_model, is_mllm_model
from auto_round.utils import is_diffusion_model, is_mllm_model, is_model_free_route

if TYPE_CHECKING:
from auto_round.auto_scheme.gen_auto_scheme import AutoScheme
Expand Down Expand Up @@ -169,6 +170,22 @@ def __new__(

return AutoRoundCompatible(**local_args, **kwargs)

# ---- Model-free fast-path detection --------------------------------
if is_model_free_route(model, scheme, iters, disable_opt_rtn, kwargs):
if not isinstance(model, str):
raise ValueError("model_free=True requires `model` to be a HuggingFace ID or local path string.")
if not bool(kwargs.get("model_free", False)):
logger.info(
"Auto-routing to model-free quantization "
"(iters=0, disable_opt_rtn=True, supported scheme). "
"Pass disable_model_free=True to use the regular flow."
)
if extra_config is not None:
local_args.update(extra_config.to_dict())
local_args["model_name_or_path"] = local_args.pop("model")
return ModelFreeCompressor(**local_args, **kwargs)
# --------------------------------------------------------------------

model_cls = []

has_multimodal_assets = kwargs.get("processor") is not None or kwargs.get("image_processor") is not None
Expand Down
1 change: 1 addition & 0 deletions auto_round/compressors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from auto_round.compressors.mllm.compressor import MLLMCompressor
from auto_round.compressors.diffusion.compressor import DiffusionCompressor
from auto_round.compressors.diffusion.hybrid import HybridCompressor
from auto_round.compressors.model_free import ModelFreeCompressor
from auto_round.compressors.config import (
DiffusionExtraConfig,
ExtraConfig,
Expand Down
Loading
Loading