From bd3824cd75c582759cffa6fe45da7fa937d97b9b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 01:51:29 -0400 Subject: [PATCH 1/6] support for model scope Signed-off-by: n1ck-guo --- auto_round/__main__.py | 7 ++ auto_round/autoround.py | 14 ++- auto_round/compressors/adam.py | 3 + auto_round/compressors/base.py | 4 + .../compressors/diffusion/compressor.py | 5 +- auto_round/compressors/mllm/compressor.py | 6 +- auto_round/compressors/utils.py | 6 +- auto_round/envs.py | 2 + auto_round/export/export_to_gguf/convert.py | 4 +- auto_round/export/export_to_gguf/export.py | 4 +- .../export/export_to_gguf/special_handle.py | 4 +- auto_round/utils/model.py | 103 ++++++++++++++---- 12 files changed, 130 insertions(+), 32 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index c403ee863..6a30d0233 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -44,6 +44,12 @@ def __init__(self, *args, **kwargs): help="Path to the pre-trained model or model identifier from huggingface.co/models. " "Examples: 'facebook/opt-125m', 'bert-base-uncased', or local path like '/path/to/model'", ) + basic.add_argument( + "--platform", + default="hf", + help="Platform to load the pre-trained model. Options: [hf, model_scope]." + " hf stands for huggingface and model_scope stands for model scope.", + ) basic.add_argument( "--scheme", default="W4A16", @@ -565,6 +571,7 @@ def tune(args): autoround: BaseCompressor = AutoRound( model=model_name, + platform=args.platform, scheme=scheme, dataset=args.dataset, iters=args.iters, diff --git a/auto_round/autoround.py b/auto_round/autoround.py index a78c94737..7e717f3a9 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -39,6 +39,7 @@ class AutoRound: Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. bits (int): Weight quantization bits. group_size (int): Per-group size for weight quantization. @@ -63,6 +64,7 @@ class AutoRound: def __new__( cls, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -143,7 +145,7 @@ def __new__( """ model_cls = [] - if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model): + if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform): logger.info("using MLLM mode for multimodal model.") model_cls.append(MLLMCompressor) if extra_config: @@ -166,6 +168,7 @@ def __new__( kwargs.update(extra_config.to_dict()) ar = dynamic_compressor( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -310,6 +313,7 @@ class AutoRoundLLM(LLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -327,6 +331,7 @@ def __init__( ): super().__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -350,6 +355,7 @@ class AutoRoundAdam(AdamCompressor): Args: model: The PyTorch model to be quantized. 
+ platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). @@ -409,6 +415,7 @@ class AutoRoundAdam(AdamCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -427,6 +434,7 @@ def __init__( ): super().__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -451,6 +459,7 @@ class AutoRoundMLLM(MLLMCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). @@ -509,6 +518,7 @@ class AutoRoundMLLM(MLLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, processor=None, image_processor=None, @@ -529,6 +539,7 @@ def __init__( ): super().__init__( model=model, + platform=platform, tokenizer=tokenizer, processor=processor, image_processor=image_processor, @@ -555,6 +566,7 @@ class AutoRoundDiffusion(DiffusionCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py index 4606eab3a..f0dda9155 100644 --- a/auto_round/compressors/adam.py +++ b/auto_round/compressors/adam.py @@ -26,6 +26,7 @@ class AdamCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). @@ -85,6 +86,7 @@ class AdamCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform="hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -103,6 +105,7 @@ def __init__( ): super(AdamCompressor, self).__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 297f8a50e..6d832f999 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -103,6 +103,7 @@ class BaseCompressor(object): Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. bits (int): Weight quantization bits. 
group_size (int): Per-group size for weight quantization. @@ -127,6 +128,7 @@ class BaseCompressor(object): def __init__( self, model: Union[torch.nn.Module, str], + platform="hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -232,6 +234,7 @@ def __init__( device = kwargs.pop("device", None) # Scale factor for RAM usage per parameter. mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) + self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False @@ -263,6 +266,7 @@ def __init__( if isinstance(model, str): model, tokenizer = llm_load_model( model, + platform=platform, device="cpu", # always load cpu first ) elif tokenizer is None and not self.diffusion and iters > 0: diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index cb69e6a88..7d644033d 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -46,6 +46,7 @@ class DiffusionCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). @@ -80,6 +81,7 @@ class DiffusionCompressor(BaseCompressor): def __init__( self, model: Union[object, str], + platform: str = "hf", tokenizer=None, guidance_scale: float = 7.5, num_inference_steps: int = 50, @@ -110,7 +112,7 @@ def __init__( self._set_device(device_map) if isinstance(model, str): - pipe, model = diffusion_load_model(model, device=self.device) + pipe, model = diffusion_load_model(model, platform=platform, device=self.device) elif isinstance(model, pipeline_utils.DiffusionPipeline): pipe = model model = pipe.transformer @@ -144,6 +146,7 @@ def __init__( kwargs["diffusion"] = True super(DiffusionCompressor, self).__init__( model=model, + platform=platform, tokenizer=None, scheme=scheme, layer_config=layer_config, diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index b5586a4bb..041a5e785 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -86,6 +86,7 @@ class MLLMCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). 
@@ -144,6 +145,7 @@ class MLLMCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, processor=None, image_processor=None, @@ -171,7 +173,7 @@ def __init__( self._set_device(device_map) if isinstance(model, str): - model, processor, tokenizer, image_processor = mllm_load_model(model, device=self.device) + model, processor, tokenizer, image_processor = mllm_load_model(model, platform=platform, device=self.device) self.model = model quant_nontext_module = self._check_quant_nontext(layer_config, quant_nontext_module) @@ -257,6 +259,7 @@ def __init__( kwargs["mllm"] = True super(MLLMCompressor, self).__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -374,6 +377,7 @@ def calib(self, nsamples, bs): continue try: if isinstance(data_new, torch.Tensor): + data_new = data_new.to(self.model.device) self.model(data_new) elif isinstance(data_new, tuple) or isinstance(data_new, list): self.model(*data_new) diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 6eb43e056..d5a093d1b 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -480,7 +480,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType. from auto_round.export.export_to_gguf.convert import download_convert_file from auto_round.logger import logger - from auto_round.utils.model import download_hf_model, get_gguf_architecture + from auto_round.utils.model import download_or_get_path, get_gguf_architecture formats = sorted(formats, key=lambda x: len(x)) export_gguf = False @@ -505,7 +505,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType. else: model_path = args_or_ar.model.name_or_path if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) + model_path = download_or_get_path(model_path, args_or_ar.platform) model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: logger.warning( @@ -539,7 +539,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType. 
else: model_path = args_or_ar.model.name_or_path if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) + model_path = download_or_get_path(model_path, args_or_ar.platform) model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: logger.error(f"Model {model_architecture} is not supported to export gguf format.") diff --git a/auto_round/envs.py b/auto_round/envs.py index ae0d79f1d..9c9c503fd 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -18,10 +18,12 @@ if TYPE_CHECKING: AR_LOG_LEVEL: str = "INFO" + MODEL_PLATFORM: str = "HF" environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), + "MODEL_PLATFORM": lambda: os.getenv("MODEL_PLATFORM", "HF").upper(), } diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 5a7e803c5..0a64c23d2 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -167,11 +167,11 @@ def is_extra_tensor(tensor_name): from safetensors import safe_open from auto_round.export.export_to_gguf.special_handle import get_tensor_from_file - from auto_round.utils import download_hf_model + from auto_round.utils import download_or_get_path dir_path = cls.model.name_or_path if not os.path.isdir(dir_path): - dir_path = download_hf_model(dir_path) + dir_path = download_or_get_path(dir_path) INDEX_FILE = "model.safetensors.index.json" if INDEX_FILE in os.listdir(dir_path): with open(os.path.join(dir_path, INDEX_FILE)) as f: diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 526056263..8633a2a50 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -28,7 +28,7 @@ LazyImport, check_to_quantized, clear_memory, - download_hf_model, + download_or_get_path, flatten_list, get_block_names, get_gguf_architecture, @@ -77,7 +77,7 @@ def create_model_class( tmp_work_dir = model.name_or_path os.makedirs(output_dir, exist_ok=True) if not os.path.isdir(tmp_work_dir): - tmp_work_dir = download_hf_model(tmp_work_dir) + tmp_work_dir = download_or_get_path(tmp_work_dir) with torch.inference_mode(): model_architecture = get_gguf_architecture(tmp_work_dir, model_type=model_type) try: diff --git a/auto_round/export/export_to_gguf/special_handle.py b/auto_round/export/export_to_gguf/special_handle.py index f8a463e9b..cb15af7a1 100644 --- a/auto_round/export/export_to_gguf/special_handle.py +++ b/auto_round/export/export_to_gguf/special_handle.py @@ -20,7 +20,7 @@ from safetensors import safe_open from torch import Tensor -from auto_round.utils import download_hf_model +from auto_round.utils import download_or_get_path def handle_special_model(cls, model_architecture): @@ -32,7 +32,7 @@ def handle_special_model(cls, model_architecture): def get_tensor_from_file(dir_path, tensor_name): if not os.path.isdir(dir_path): - dir_path = download_hf_model(dir_path) + dir_path = download_or_get_path(dir_path) INDEX_FILE = "model.safetensors.index.json" # get filename if INDEX_FILE in os.listdir(dir_path): diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 7a1c66de8..9415d5385 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -148,6 +148,36 @@ def check_start_with_block_name(name: str, block_name_to_quantize: 
list): return False +def download_or_get_path(repo_id: str, platform: str = None) -> str: + from auto_round.envs import MODEL_PLATFORM + + if platform is None: + platform = MODEL_PLATFORM.lower() + + if platform == "model_scope": + return download_modelscope_model(repo_id) + else: + return download_hf_model(repo_id) + + +def download_modelscope_model(repo_id: str, local_dir: str = None, cache_dir: str = None): + from modelscope.utils.file_utils import get_modelscope_cache_dir + + system_cache = cache_dir if cache_dir is not None else get_modelscope_cache_dir() + if local_dir: + directory = os.path.abspath(local_dir) + elif cache_dir: + directory = os.path.join(system_cache, *repo_id.split("/")) + else: + directory = os.path.join(system_cache, "models", *repo_id.split("/")) + if os.path.exists(directory): + return directory + else: + from modelscope.hub.snapshot_download import snapshot_download + + return snapshot_download(repo_id) + + def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): """Download hugging face model from hf hub.""" from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE @@ -180,13 +210,22 @@ def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): def llm_load_model( - pretrained_model_name_or_path, - trust_remote_code=True, - model_dtype=None, - device="cpu", + pretrained_model_name_or_path: str, + platform: str = "hf", + trust_remote_code: bool = True, + model_dtype: str = None, + device: str = "cpu", **kwargs, ): - from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer + assert platform.lower() in [ + "hf", + "model_scope", + ], "current only support hf or model_scope platform to load pretrained model." + os.environ["MODEL_PLATFORM"] = platform.upper() + if platform == "model_scope": + from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from auto_round.utils.device import ( _use_hpu_compile_mode, @@ -254,16 +293,31 @@ def llm_load_model( def mllm_load_model( - pretrained_model_name_or_path, - device="cpu", - torch_dtype="auto", - use_auto_mapping=True, - trust_remote_code=True, - model_dtype=None, + pretrained_model_name_or_path: str, + platform: str = "hf", + device: str = "cpu", + torch_dtype: str = "auto", + use_auto_mapping: bool = True, + trust_remote_code: bool = True, + model_dtype: str = None, **kwargs, ): - import transformers - from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + assert platform.lower() in [ + "hf", + "model_scope", + ], "current only support hf or model_scope platform to load pretrained model." 
+ os.environ["MODEL_PLATFORM"] = platform.upper() + + if platform == "model_scope": + import modelscope + from modelscope import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + + base_lib = modelscope + else: + import transformers + from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + + base_lib = transformers from auto_round.utils.device import get_device_and_parallelism, set_fake_cuda_device_capability @@ -322,11 +376,11 @@ def mllm_load_model( ) else: if architectures.endswith("Model") and hasattr( - transformers, n := architectures.replace("Model", "ForConditionalGeneration") + base_lib, n := architectures.replace("Model", "ForConditionalGeneration") ): - cls = getattr(transformers, n) - elif hasattr(transformers, architectures): - cls = getattr(transformers, architectures) + cls = getattr(base_lib, n) + elif hasattr(base_lib, architectures): + cls = getattr(base_lib, architectures) else: cls = AutoModelForCausalLM try: @@ -365,7 +419,10 @@ def mllm_load_model( pretrained_model_name_or_path, trust_remote_code=trust_remote_code ) try: - from transformers import AutoImageProcessor + if platform == "model_scope": + from modelscope import AutoImageProcessor + else: + from transformers import AutoImageProcessor image_processor = AutoImageProcessor.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code @@ -382,6 +439,7 @@ def mllm_load_model( def diffusion_load_model( pretrained_model_name_or_path: str, + platform: str = "hf", device: Union[str, torch.device] = "cpu", torch_dtype: Union[str, torch.dtype] = "auto", use_auto_mapping: bool = False, @@ -392,6 +450,11 @@ def diffusion_load_model( from auto_round.utils.common import LazyImport from auto_round.utils.device import get_device_and_parallelism + if platform != "hf": + raise NotImplementedError( + f"auto_round current only support hf as platform for diffusion model, but get {platform}" + ) + device_str, use_auto_mapping = get_device_and_parallelism(device) torch_dtype = "auto" if device_str is not None and "hpu" in device_str: @@ -425,7 +488,7 @@ def is_pure_text_model(model): return True -def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): +def is_mllm_model(model_or_path: Union[str, torch.nn.Module], platform: str = None): MM_KEYS = [ "multi_modal_projector", "vision_tower", @@ -446,7 +509,7 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) + model_path = download_or_get_path(model_path, platform=platform) if isinstance(model_path, str): if os.path.exists(os.path.join(model_path, "preprocessor_config.json")): From 415107ae9348472fb5a8620f891fca98af884e2c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 01:52:14 -0400 Subject: [PATCH 2/6] add ut Signed-off-by: n1ck-guo --- test/test_cpu/test_model_scope.py | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 test/test_cpu/test_model_scope.py diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py new file mode 100644 index 000000000..0663bcf12 --- /dev/null +++ b/test/test_cpu/test_model_scope.py @@ -0,0 +1,51 @@ +import copy +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") + +import torch + +from auto_round import AutoRound + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def 
__iter__(self): + for i in range(3): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestModelScope(unittest.TestCase): + @classmethod + def setUpClass(self): + self.saved_path = "./saved" + self.dataset = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + return super().tearDownClass() + + def test_llm(self): + model_name = "Qwen/Qwen2.5-0.5B-Instruct" + autoround = AutoRound( + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset + ) + autoround.quantize_and_save() + + def test_mllm(self): + model_name = "Qwen/Qwen2-VL-2B-Instruct" + autoround = AutoRound( + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 + ) + autoround.quantize_and_save(self.saved_path) + + +if __name__ == "__main__": + unittest.main() From 65ef85cb4214224bf8d2fffe059b6cda51cdae2f Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 02:25:17 -0400 Subject: [PATCH 3/6] env arg first Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6d832f999..75448995f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -234,6 +234,9 @@ def __init__( device = kwargs.pop("device", None) # Scale factor for RAM usage per parameter. mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) + + if os.getenv("MODEL_PLATFORM"): + platform = os.getenv("MODEL_PLATFORM").lower() self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False From c40431722eac5aa522a63f6e0cb03bc57523344f Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 20:17:54 -0400 Subject: [PATCH 4/6] update Signed-off-by: n1ck-guo --- auto_round/autoround.py | 24 +++++++++---------- auto_round/compressors/adam.py | 6 ++--- auto_round/compressors/base.py | 8 +++---- .../compressors/diffusion/compressor.py | 6 ++--- auto_round/compressors/mllm/compressor.py | 6 ++--- auto_round/envs.py | 4 ++-- auto_round/utils/model.py | 11 +++++---- 7 files changed, 34 insertions(+), 31 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 7e717f3a9..c43b2e425 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -39,8 +39,8 @@ class AutoRound: Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] bits (int): Weight quantization bits. group_size (int): Per-group size for weight quantization. sym (bool): Whether to use symmetric weight quantization. 
@@ -64,8 +64,8 @@ class AutoRound: def __new__( cls, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -168,8 +168,8 @@ def __new__( kwargs.update(extra_config.to_dict()) ar = dynamic_compressor( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, @@ -313,8 +313,8 @@ class AutoRoundLLM(LLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -331,8 +331,8 @@ def __init__( ): super().__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, @@ -355,8 +355,8 @@ class AutoRoundAdam(AdamCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). @@ -415,8 +415,8 @@ class AutoRoundAdam(AdamCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -434,8 +434,8 @@ def __init__( ): super().__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, batch_size=batch_size, @@ -459,8 +459,8 @@ class AutoRoundMLLM(MLLMCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). image_processor: Image processor for special model like llava. @@ -518,8 +518,8 @@ class AutoRoundMLLM(MLLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", processor=None, image_processor=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", @@ -539,8 +539,8 @@ def __init__( ): super().__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, processor=processor, image_processor=image_processor, scheme=scheme, @@ -566,8 +566,8 @@ class AutoRoundDiffusion(DiffusionCompressor): Args: model: The PyTorch model to be quantized. 
- platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). num_inference_steps (int): The reference number of denoising steps (default is 50). diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py index f0dda9155..fb79cf39a 100644 --- a/auto_round/compressors/adam.py +++ b/auto_round/compressors/adam.py @@ -26,8 +26,8 @@ class AdamCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). @@ -86,8 +86,8 @@ class AdamCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform="hf", tokenizer=None, + platform="hf", scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -105,8 +105,8 @@ def __init__( ): super(AdamCompressor, self).__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, batch_size=batch_size, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 75448995f..1bed3255c 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -103,8 +103,8 @@ class BaseCompressor(object): Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] bits (int): Weight quantization bits. group_size (int): Per-group size for weight quantization. sym (bool): Whether to use symmetric weight quantization. @@ -128,8 +128,8 @@ class BaseCompressor(object): def __init__( self, model: Union[torch.nn.Module, str], - platform="hf", tokenizer=None, + platform="hf", scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -235,8 +235,8 @@ def __init__( # Scale factor for RAM usage per parameter. 
mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - if os.getenv("MODEL_PLATFORM"): - platform = os.getenv("MODEL_PLATFORM").lower() + if os.getenv("AUTOROUND_USE_MODELSCOPE", False): + platform = "model_scope" self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 7d644033d..904767602 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -46,8 +46,8 @@ class DiffusionCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). num_inference_steps (int): The reference number of denoising steps (default is 50). @@ -81,8 +81,8 @@ class DiffusionCompressor(BaseCompressor): def __init__( self, model: Union[object, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", guidance_scale: float = 7.5, num_inference_steps: int = 50, generator_seed: int = None, @@ -146,8 +146,8 @@ def __init__( kwargs["diffusion"] = True super(DiffusionCompressor, self).__init__( model=model, - platform=platform, tokenizer=None, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 041a5e785..2cfa457b7 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -86,8 +86,8 @@ class MLLMCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). image_processor: Image processor for special model like llava. 
@@ -145,8 +145,8 @@ class MLLMCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", processor=None, image_processor=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", @@ -259,8 +259,8 @@ def __init__( kwargs["mllm"] = True super(MLLMCompressor, self).__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, diff --git a/auto_round/envs.py b/auto_round/envs.py index 9c9c503fd..43351d503 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -18,12 +18,12 @@ if TYPE_CHECKING: AR_LOG_LEVEL: str = "INFO" - MODEL_PLATFORM: str = "HF" + AUTOROUND_USE_MODELSCOPE: bool = "False" environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), - "MODEL_PLATFORM": lambda: os.getenv("MODEL_PLATFORM", "HF").upper(), + "AUTOROUND_USE_MODELSCOPE": lambda: os.getenv("AUTOROUND_USE_MODELSCOPE ", "False").lower() in ["1", "true"], } diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 9415d5385..e2998196b 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -149,10 +149,13 @@ def check_start_with_block_name(name: str, block_name_to_quantize: list): def download_or_get_path(repo_id: str, platform: str = None) -> str: - from auto_round.envs import MODEL_PLATFORM + from auto_round.envs import AUTOROUND_USE_MODELSCOPE if platform is None: - platform = MODEL_PLATFORM.lower() + if AUTOROUND_USE_MODELSCOPE: + platform = "model_scope" + else: + platform = "hf" if platform == "model_scope": return download_modelscope_model(repo_id) @@ -221,7 +224,7 @@ def llm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["MODEL_PLATFORM"] = platform.upper() + os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" if platform == "model_scope": from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer else: @@ -306,7 +309,7 @@ def mllm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["MODEL_PLATFORM"] = platform.upper() + os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" if platform == "model_scope": import modelscope From adf3955e8b1aef964fa2764f547bd8181f0ca2b1 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 22:56:15 -0400 Subject: [PATCH 5/6] update Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 2 +- auto_round/envs.py | 4 ++-- auto_round/utils/model.py | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 1bed3255c..19a191f48 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -235,7 +235,7 @@ def __init__( # Scale factor for RAM usage per parameter. 
mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - if os.getenv("AUTOROUND_USE_MODELSCOPE", False): + if os.getenv("AR_USE_MODELSCOPE", False): platform = "model_scope" self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) diff --git a/auto_round/envs.py b/auto_round/envs.py index 43351d503..6c09a2aad 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -18,12 +18,12 @@ if TYPE_CHECKING: AR_LOG_LEVEL: str = "INFO" - AUTOROUND_USE_MODELSCOPE: bool = "False" + AR_USE_MODELSCOPE: bool = "False" environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), - "AUTOROUND_USE_MODELSCOPE": lambda: os.getenv("AUTOROUND_USE_MODELSCOPE ", "False").lower() in ["1", "true"], + "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE ", "False").lower() in ["1", "true"], } diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index e2998196b..9bd29dabf 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -149,10 +149,10 @@ def check_start_with_block_name(name: str, block_name_to_quantize: list): def download_or_get_path(repo_id: str, platform: str = None) -> str: - from auto_round.envs import AUTOROUND_USE_MODELSCOPE + from auto_round.envs import AR_USE_MODELSCOPE if platform is None: - if AUTOROUND_USE_MODELSCOPE: + if AR_USE_MODELSCOPE: platform = "model_scope" else: platform = "hf" @@ -164,7 +164,7 @@ def download_or_get_path(repo_id: str, platform: str = None) -> str: def download_modelscope_model(repo_id: str, local_dir: str = None, cache_dir: str = None): - from modelscope.utils.file_utils import get_modelscope_cache_dir + from modelscope.utils.file_utils import get_modelscope_cache_dir # pylint: disable=E0401 system_cache = cache_dir if cache_dir is not None else get_modelscope_cache_dir() if local_dir: @@ -176,7 +176,7 @@ def download_modelscope_model(repo_id: str, local_dir: str = None, cache_dir: st if os.path.exists(directory): return directory else: - from modelscope.hub.snapshot_download import snapshot_download + from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=E0401 return snapshot_download(repo_id) @@ -224,9 +224,9 @@ def llm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" + os.environ["AR_USE_MODELSCOPE"] = "True" if platform.lower() == "model_scope" else "False" if platform == "model_scope": - from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer + from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer # pylint: disable=E0401 else: from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer @@ -309,11 +309,11 @@ def mllm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." 
- os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" + os.environ["AR_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" if platform == "model_scope": - import modelscope - from modelscope import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + import modelscope # pylint: disable=E0401 + from modelscope import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer # pylint: disable=E0401 base_lib = modelscope else: @@ -423,7 +423,7 @@ def mllm_load_model( ) try: if platform == "model_scope": - from modelscope import AutoImageProcessor + from modelscope import AutoImageProcessor # pylint: disable=E0401 else: from transformers import AutoImageProcessor From db96ade76f819592f64a5b6dcb8b08d56dfea65c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 23:34:28 -0400 Subject: [PATCH 6/6] update Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 3 ++- auto_round/envs.py | 29 ++++++++++++++++++++++++++++- auto_round/utils/model.py | 11 +++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 19a191f48..997dc64f3 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -30,6 +30,7 @@ from tqdm import tqdm from transformers import set_seed +from auto_round import envs from auto_round.compressors.utils import ( block_forward, check_need_act_calibration, @@ -235,7 +236,7 @@ def __init__( # Scale factor for RAM usage per parameter. mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - if os.getenv("AR_USE_MODELSCOPE", False): + if envs.AR_USE_MODELSCOPE: platform = "model_scope" self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) diff --git a/auto_round/envs.py b/auto_round/envs.py index 6c09a2aad..3d9b0dfa3 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -23,7 +23,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), - "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE ", "False").lower() in ["1", "true"], + "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"], } @@ -43,3 +43,30 @@ def is_set(name: str): if name in environment_variables: return name in os.environ raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def set_config(**kwargs): + """ + Set configuration values for environment variables. + + Args: + **kwargs: Keyword arguments where keys are environment variable names + and values are the desired values to set. 
+ + Example: + set_config(AR_LOG_LEVEL="DEBUG", AR_USE_MODELSCOPE=True) + """ + for key, value in kwargs.items(): + if key in environment_variables: + # Convert value to appropriate string format + if key == "AR_USE_MODELSCOPE": + # Handle boolean values for AR_USE_MODELSCOPE + str_value = "true" if value in [True, "True", "true", "1", 1] else "false" + else: + # For other variables, convert to string + str_value = str(value) + + # Set the environment variable + os.environ[key] = str_value + else: + raise AttributeError(f"module {__name__!r} has no attribute {key!r}") diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 9bd29dabf..427ec7f9c 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -23,6 +23,7 @@ import torch import transformers +from auto_round import envs from auto_round.export.export_to_gguf.config import ModelType from auto_round.logger import logger from auto_round.schemes import QuantizationScheme @@ -149,10 +150,10 @@ def check_start_with_block_name(name: str, block_name_to_quantize: list): def download_or_get_path(repo_id: str, platform: str = None) -> str: - from auto_round.envs import AR_USE_MODELSCOPE + from auto_round import envs if platform is None: - if AR_USE_MODELSCOPE: + if envs.AR_USE_MODELSCOPE: platform = "model_scope" else: platform = "hf" @@ -224,7 +225,8 @@ def llm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["AR_USE_MODELSCOPE"] = "True" if platform.lower() == "model_scope" else "False" + if platform.lower() == "model_scope" and not envs.AR_USE_MODELSCOPE: + envs.set_config(AR_USE_MODELSCOPE=True) if platform == "model_scope": from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer # pylint: disable=E0401 else: @@ -309,7 +311,8 @@ def mllm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["AR_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" + if platform.lower() == "model_scope" and not envs.AR_USE_MODELSCOPE: + envs.set_config(AR_USE_MODELSCOPE=True) if platform == "model_scope": import modelscope # pylint: disable=E0401
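
For readers of the series, a minimal usage sketch of the end state. The model id, scheme, and save path below are illustrative and mirror the unit test added in patch 2; the environment-variable path reflects the final AR_USE_MODELSCOPE handling from patches 5 and 6.

    # Sketch: quantize a ModelScope-hosted checkpoint instead of a Hugging Face one.
    # Model id, scheme, and save path are illustrative (taken from test/test_cpu/test_model_scope.py).
    from auto_round import AutoRound

    autoround = AutoRound(
        "Qwen/Qwen2.5-0.5B-Instruct",   # repo id resolved on ModelScope instead of the HF hub
        platform="model_scope",          # equivalently, set AR_USE_MODELSCOPE=1 in the environment
        scheme="W4A16",
        iters=0,
    )
    autoround.quantize_and_save("./saved")

The same switch is exposed on the command line through the new --platform flag (hf by default, model_scope for ModelScope); diffusion models still raise NotImplementedError for any platform other than hf.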