From bd3824cd75c582759cffa6fe45da7fa937d97b9b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 01:51:29 -0400 Subject: [PATCH 1/6] support for model scope Signed-off-by: n1ck-guo --- auto_round/__main__.py | 7 ++ auto_round/autoround.py | 14 ++- auto_round/compressors/adam.py | 3 + auto_round/compressors/base.py | 4 + .../compressors/diffusion/compressor.py | 5 +- auto_round/compressors/mllm/compressor.py | 6 +- auto_round/compressors/utils.py | 6 +- auto_round/envs.py | 2 + auto_round/export/export_to_gguf/convert.py | 4 +- auto_round/export/export_to_gguf/export.py | 4 +- .../export/export_to_gguf/special_handle.py | 4 +- auto_round/utils/model.py | 103 ++++++++++++++---- 12 files changed, 130 insertions(+), 32 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index c403ee863..6a30d0233 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -44,6 +44,12 @@ def __init__(self, *args, **kwargs): help="Path to the pre-trained model or model identifier from huggingface.co/models. " "Examples: 'facebook/opt-125m', 'bert-base-uncased', or local path like '/path/to/model'", ) + basic.add_argument( + "--platform", + default="hf", + help="Platform to load the pre-trained model. Options: [hf, model_scope]." + " hf stands for huggingface and model_scope stands for model scope.", + ) basic.add_argument( "--scheme", default="W4A16", @@ -565,6 +571,7 @@ def tune(args): autoround: BaseCompressor = AutoRound( model=model_name, + platform=args.platform, scheme=scheme, dataset=args.dataset, iters=args.iters, diff --git a/auto_round/autoround.py b/auto_round/autoround.py index a78c94737..7e717f3a9 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -39,6 +39,7 @@ class AutoRound: Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. bits (int): Weight quantization bits. group_size (int): Per-group size for weight quantization. @@ -63,6 +64,7 @@ class AutoRound: def __new__( cls, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -143,7 +145,7 @@ def __new__( """ model_cls = [] - if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model): + if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform): logger.info("using MLLM mode for multimodal model.") model_cls.append(MLLMCompressor) if extra_config: @@ -166,6 +168,7 @@ def __new__( kwargs.update(extra_config.to_dict()) ar = dynamic_compressor( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -310,6 +313,7 @@ class AutoRoundLLM(LLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -327,6 +331,7 @@ def __init__( ): super().__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -350,6 +355,7 @@ class AutoRoundAdam(AdamCompressor): Args: model: The PyTorch model to be quantized. 
+ platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). @@ -409,6 +415,7 @@ class AutoRoundAdam(AdamCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -427,6 +434,7 @@ def __init__( ): super().__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -451,6 +459,7 @@ class AutoRoundMLLM(MLLMCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). @@ -509,6 +518,7 @@ class AutoRoundMLLM(MLLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, processor=None, image_processor=None, @@ -529,6 +539,7 @@ def __init__( ): super().__init__( model=model, + platform=platform, tokenizer=tokenizer, processor=processor, image_processor=image_processor, @@ -555,6 +566,7 @@ class AutoRoundDiffusion(DiffusionCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py index 4606eab3a..f0dda9155 100644 --- a/auto_round/compressors/adam.py +++ b/auto_round/compressors/adam.py @@ -26,6 +26,7 @@ class AdamCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). @@ -85,6 +86,7 @@ class AdamCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform="hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -103,6 +105,7 @@ def __init__( ): super(AdamCompressor, self).__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 297f8a50e..6d832f999 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -103,6 +103,7 @@ class BaseCompressor(object): Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. bits (int): Weight quantization bits. 
group_size (int): Per-group size for weight quantization. @@ -127,6 +128,7 @@ class BaseCompressor(object): def __init__( self, model: Union[torch.nn.Module, str], + platform="hf", tokenizer=None, scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, @@ -232,6 +234,7 @@ def __init__( device = kwargs.pop("device", None) # Scale factor for RAM usage per parameter. mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) + self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False @@ -263,6 +266,7 @@ def __init__( if isinstance(model, str): model, tokenizer = llm_load_model( model, + platform=platform, device="cpu", # always load cpu first ) elif tokenizer is None and not self.diffusion and iters > 0: diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index cb69e6a88..7d644033d 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -46,6 +46,7 @@ class DiffusionCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). @@ -80,6 +81,7 @@ class DiffusionCompressor(BaseCompressor): def __init__( self, model: Union[object, str], + platform: str = "hf", tokenizer=None, guidance_scale: float = 7.5, num_inference_steps: int = 50, @@ -110,7 +112,7 @@ def __init__( self._set_device(device_map) if isinstance(model, str): - pipe, model = diffusion_load_model(model, device=self.device) + pipe, model = diffusion_load_model(model, platform=platform, device=self.device) elif isinstance(model, pipeline_utils.DiffusionPipeline): pipe = model model = pipe.transformer @@ -144,6 +146,7 @@ def __init__( kwargs["diffusion"] = True super(DiffusionCompressor, self).__init__( model=model, + platform=platform, tokenizer=None, scheme=scheme, layer_config=layer_config, diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index b5586a4bb..041a5e785 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -86,6 +86,7 @@ class MLLMCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). 
@@ -144,6 +145,7 @@ class MLLMCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], + platform: str = "hf", tokenizer=None, processor=None, image_processor=None, @@ -171,7 +173,7 @@ def __init__( self._set_device(device_map) if isinstance(model, str): - model, processor, tokenizer, image_processor = mllm_load_model(model, device=self.device) + model, processor, tokenizer, image_processor = mllm_load_model(model, platform=platform, device=self.device) self.model = model quant_nontext_module = self._check_quant_nontext(layer_config, quant_nontext_module) @@ -257,6 +259,7 @@ def __init__( kwargs["mllm"] = True super(MLLMCompressor, self).__init__( model=model, + platform=platform, tokenizer=tokenizer, scheme=scheme, layer_config=layer_config, @@ -374,6 +377,7 @@ def calib(self, nsamples, bs): continue try: if isinstance(data_new, torch.Tensor): + data_new = data_new.to(self.model.device) self.model(data_new) elif isinstance(data_new, tuple) or isinstance(data_new, list): self.model(*data_new) diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 6eb43e056..d5a093d1b 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -480,7 +480,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType. from auto_round.export.export_to_gguf.convert import download_convert_file from auto_round.logger import logger - from auto_round.utils.model import download_hf_model, get_gguf_architecture + from auto_round.utils.model import download_or_get_path, get_gguf_architecture formats = sorted(formats, key=lambda x: len(x)) export_gguf = False @@ -505,7 +505,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType. else: model_path = args_or_ar.model.name_or_path if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) + model_path = download_or_get_path(model_path, args_or_ar.platform) model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: logger.warning( @@ -539,7 +539,7 @@ def gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType. 
else: model_path = args_or_ar.model.name_or_path if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) + model_path = download_or_get_path(model_path, args_or_ar.platform) model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: logger.error(f"Model {model_architecture} is not supported to export gguf format.") diff --git a/auto_round/envs.py b/auto_round/envs.py index ae0d79f1d..9c9c503fd 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -18,10 +18,12 @@ if TYPE_CHECKING: AR_LOG_LEVEL: str = "INFO" + MODEL_PLATFORM: str = "HF" environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), + "MODEL_PLATFORM": lambda: os.getenv("MODEL_PLATFORM", "HF").upper(), } diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 5a7e803c5..0a64c23d2 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -167,11 +167,11 @@ def is_extra_tensor(tensor_name): from safetensors import safe_open from auto_round.export.export_to_gguf.special_handle import get_tensor_from_file - from auto_round.utils import download_hf_model + from auto_round.utils import download_or_get_path dir_path = cls.model.name_or_path if not os.path.isdir(dir_path): - dir_path = download_hf_model(dir_path) + dir_path = download_or_get_path(dir_path) INDEX_FILE = "model.safetensors.index.json" if INDEX_FILE in os.listdir(dir_path): with open(os.path.join(dir_path, INDEX_FILE)) as f: diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 526056263..8633a2a50 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -28,7 +28,7 @@ LazyImport, check_to_quantized, clear_memory, - download_hf_model, + download_or_get_path, flatten_list, get_block_names, get_gguf_architecture, @@ -77,7 +77,7 @@ def create_model_class( tmp_work_dir = model.name_or_path os.makedirs(output_dir, exist_ok=True) if not os.path.isdir(tmp_work_dir): - tmp_work_dir = download_hf_model(tmp_work_dir) + tmp_work_dir = download_or_get_path(tmp_work_dir) with torch.inference_mode(): model_architecture = get_gguf_architecture(tmp_work_dir, model_type=model_type) try: diff --git a/auto_round/export/export_to_gguf/special_handle.py b/auto_round/export/export_to_gguf/special_handle.py index f8a463e9b..cb15af7a1 100644 --- a/auto_round/export/export_to_gguf/special_handle.py +++ b/auto_round/export/export_to_gguf/special_handle.py @@ -20,7 +20,7 @@ from safetensors import safe_open from torch import Tensor -from auto_round.utils import download_hf_model +from auto_round.utils import download_or_get_path def handle_special_model(cls, model_architecture): @@ -32,7 +32,7 @@ def handle_special_model(cls, model_architecture): def get_tensor_from_file(dir_path, tensor_name): if not os.path.isdir(dir_path): - dir_path = download_hf_model(dir_path) + dir_path = download_or_get_path(dir_path) INDEX_FILE = "model.safetensors.index.json" # get filename if INDEX_FILE in os.listdir(dir_path): diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 7a1c66de8..9415d5385 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -148,6 +148,36 @@ def check_start_with_block_name(name: str, block_name_to_quantize: 
list): return False +def download_or_get_path(repo_id: str, platform: str = None) -> str: + from auto_round.envs import MODEL_PLATFORM + + if platform is None: + platform = MODEL_PLATFORM.lower() + + if platform == "model_scope": + return download_modelscope_model(repo_id) + else: + return download_hf_model(repo_id) + + +def download_modelscope_model(repo_id: str, local_dir: str = None, cache_dir: str = None): + from modelscope.utils.file_utils import get_modelscope_cache_dir + + system_cache = cache_dir if cache_dir is not None else get_modelscope_cache_dir() + if local_dir: + directory = os.path.abspath(local_dir) + elif cache_dir: + directory = os.path.join(system_cache, *repo_id.split("/")) + else: + directory = os.path.join(system_cache, "models", *repo_id.split("/")) + if os.path.exists(directory): + return directory + else: + from modelscope.hub.snapshot_download import snapshot_download + + return snapshot_download(repo_id) + + def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): """Download hugging face model from hf hub.""" from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE @@ -180,13 +210,22 @@ def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): def llm_load_model( - pretrained_model_name_or_path, - trust_remote_code=True, - model_dtype=None, - device="cpu", + pretrained_model_name_or_path: str, + platform: str = "hf", + trust_remote_code: bool = True, + model_dtype: str = None, + device: str = "cpu", **kwargs, ): - from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer + assert platform.lower() in [ + "hf", + "model_scope", + ], "current only support hf or model_scope platform to load pretrained model." + os.environ["MODEL_PLATFORM"] = platform.upper() + if platform == "model_scope": + from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer + else: + from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from auto_round.utils.device import ( _use_hpu_compile_mode, @@ -254,16 +293,31 @@ def llm_load_model( def mllm_load_model( - pretrained_model_name_or_path, - device="cpu", - torch_dtype="auto", - use_auto_mapping=True, - trust_remote_code=True, - model_dtype=None, + pretrained_model_name_or_path: str, + platform: str = "hf", + device: str = "cpu", + torch_dtype: str = "auto", + use_auto_mapping: bool = True, + trust_remote_code: bool = True, + model_dtype: str = None, **kwargs, ): - import transformers - from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + assert platform.lower() in [ + "hf", + "model_scope", + ], "current only support hf or model_scope platform to load pretrained model." 
+ os.environ["MODEL_PLATFORM"] = platform.upper() + + if platform == "model_scope": + import modelscope + from modelscope import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + + base_lib = modelscope + else: + import transformers + from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + + base_lib = transformers from auto_round.utils.device import get_device_and_parallelism, set_fake_cuda_device_capability @@ -322,11 +376,11 @@ def mllm_load_model( ) else: if architectures.endswith("Model") and hasattr( - transformers, n := architectures.replace("Model", "ForConditionalGeneration") + base_lib, n := architectures.replace("Model", "ForConditionalGeneration") ): - cls = getattr(transformers, n) - elif hasattr(transformers, architectures): - cls = getattr(transformers, architectures) + cls = getattr(base_lib, n) + elif hasattr(base_lib, architectures): + cls = getattr(base_lib, architectures) else: cls = AutoModelForCausalLM try: @@ -365,7 +419,10 @@ def mllm_load_model( pretrained_model_name_or_path, trust_remote_code=trust_remote_code ) try: - from transformers import AutoImageProcessor + if platform == "model_scope": + from modelscope import AutoImageProcessor + else: + from transformers import AutoImageProcessor image_processor = AutoImageProcessor.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code @@ -382,6 +439,7 @@ def mllm_load_model( def diffusion_load_model( pretrained_model_name_or_path: str, + platform: str = "hf", device: Union[str, torch.device] = "cpu", torch_dtype: Union[str, torch.dtype] = "auto", use_auto_mapping: bool = False, @@ -392,6 +450,11 @@ def diffusion_load_model( from auto_round.utils.common import LazyImport from auto_round.utils.device import get_device_and_parallelism + if platform != "hf": + raise NotImplementedError( + f"auto_round current only support hf as platform for diffusion model, but get {platform}" + ) + device_str, use_auto_mapping = get_device_and_parallelism(device) torch_dtype = "auto" if device_str is not None and "hpu" in device_str: @@ -425,7 +488,7 @@ def is_pure_text_model(model): return True -def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): +def is_mllm_model(model_or_path: Union[str, torch.nn.Module], platform: str = None): MM_KEYS = [ "multi_modal_projector", "vision_tower", @@ -446,7 +509,7 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path if not os.path.isdir(model_path): - model_path = download_hf_model(model_path) + model_path = download_or_get_path(model_path, platform=platform) if isinstance(model_path, str): if os.path.exists(os.path.join(model_path, "preprocessor_config.json")): From 415107ae9348472fb5a8620f891fca98af884e2c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 01:52:14 -0400 Subject: [PATCH 2/6] add ut Signed-off-by: n1ck-guo --- test/test_cpu/test_model_scope.py | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 test/test_cpu/test_model_scope.py diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py new file mode 100644 index 000000000..0663bcf12 --- /dev/null +++ b/test/test_cpu/test_model_scope.py @@ -0,0 +1,51 @@ +import copy +import shutil +import sys +import unittest + +sys.path.insert(0, "../..") + +import torch + +from auto_round import AutoRound + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def 
__iter__(self): + for i in range(3): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestModelScope(unittest.TestCase): + @classmethod + def setUpClass(self): + self.saved_path = "./saved" + self.dataset = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + return super().tearDownClass() + + def test_llm(self): + model_name = "Qwen/Qwen2.5-0.5B-Instruct" + autoround = AutoRound( + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset + ) + autoround.quantize_and_save() + + def test_mllm(self): + model_name = "Qwen/Qwen2-VL-2B-Instruct" + autoround = AutoRound( + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 + ) + autoround.quantize_and_save(self.saved_path) + + +if __name__ == "__main__": + unittest.main() From 65ef85cb4214224bf8d2fffe059b6cda51cdae2f Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 02:25:17 -0400 Subject: [PATCH 3/6] env arg first Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6d832f999..75448995f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -234,6 +234,9 @@ def __init__( device = kwargs.pop("device", None) # Scale factor for RAM usage per parameter. mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) + + if os.getenv("MODEL_PLATFORM"): + platform = os.getenv("MODEL_PLATFORM").lower() self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False From c40431722eac5aa522a63f6e0cb03bc57523344f Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 20:17:54 -0400 Subject: [PATCH 4/6] update Signed-off-by: n1ck-guo --- auto_round/autoround.py | 24 +++++++++---------- auto_round/compressors/adam.py | 6 ++--- auto_round/compressors/base.py | 8 +++---- .../compressors/diffusion/compressor.py | 6 ++--- auto_round/compressors/mllm/compressor.py | 6 ++--- auto_round/envs.py | 4 ++-- auto_round/utils/model.py | 11 +++++---- 7 files changed, 34 insertions(+), 31 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 7e717f3a9..c43b2e425 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -39,8 +39,8 @@ class AutoRound: Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] bits (int): Weight quantization bits. group_size (int): Per-group size for weight quantization. sym (bool): Whether to use symmetric weight quantization. 
@@ -64,8 +64,8 @@ class AutoRound: def __new__( cls, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -168,8 +168,8 @@ def __new__( kwargs.update(extra_config.to_dict()) ar = dynamic_compressor( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, @@ -313,8 +313,8 @@ class AutoRoundLLM(LLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -331,8 +331,8 @@ def __init__( ): super().__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, @@ -355,8 +355,8 @@ class AutoRoundAdam(AdamCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). @@ -415,8 +415,8 @@ class AutoRoundAdam(AdamCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -434,8 +434,8 @@ def __init__( ): super().__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, batch_size=batch_size, @@ -459,8 +459,8 @@ class AutoRoundMLLM(MLLMCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). image_processor: Image processor for special model like llava. @@ -518,8 +518,8 @@ class AutoRoundMLLM(MLLMCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", processor=None, image_processor=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", @@ -539,8 +539,8 @@ def __init__( ): super().__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, processor=processor, image_processor=image_processor, scheme=scheme, @@ -566,8 +566,8 @@ class AutoRoundDiffusion(DiffusionCompressor): Args: model: The PyTorch model to be quantized. 
- platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). num_inference_steps (int): The reference number of denoising steps (default is 50). diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py index f0dda9155..fb79cf39a 100644 --- a/auto_round/compressors/adam.py +++ b/auto_round/compressors/adam.py @@ -26,8 +26,8 @@ class AdamCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations bits (int): Number of bits for quantization (default is 4). group_size (int): Size of the quantization group (default is 128). @@ -86,8 +86,8 @@ class AdamCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform="hf", tokenizer=None, + platform="hf", scheme: Union[str, dict, QuantizationScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -105,8 +105,8 @@ def __init__( ): super(AdamCompressor, self).__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, batch_size=batch_size, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 75448995f..1bed3255c 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -103,8 +103,8 @@ class BaseCompressor(object): Attributes: model (torch.nn.Module): The loaded PyTorch model in eval mode. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: Tokenizer used to prepare input text for calibration/tuning. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] bits (int): Weight quantization bits. group_size (int): Per-group size for weight quantization. sym (bool): Whether to use symmetric weight quantization. @@ -128,8 +128,8 @@ class BaseCompressor(object): def __init__( self, model: Union[torch.nn.Module, str], - platform="hf", tokenizer=None, + platform="hf", scheme: Union[str, dict, QuantizationScheme, AutoScheme] = "W4A16", layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", @@ -235,8 +235,8 @@ def __init__( # Scale factor for RAM usage per parameter. 
mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - if os.getenv("MODEL_PLATFORM"): - platform = os.getenv("MODEL_PLATFORM").lower() + if os.getenv("AUTOROUND_USE_MODELSCOPE", False): + platform = "model_scope" self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 7d644033d..904767602 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -46,8 +46,8 @@ class DiffusionCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data, is not used for diffusion models. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] guidance_scale (float): Control how much the image generation process follows the text prompt. The more it is, the more closely it follows the prompt (default is 7.5). num_inference_steps (int): The reference number of denoising steps (default is 50). @@ -81,8 +81,8 @@ class DiffusionCompressor(BaseCompressor): def __init__( self, model: Union[object, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", guidance_scale: float = 7.5, num_inference_steps: int = 50, generator_seed: int = None, @@ -146,8 +146,8 @@ def __init__( kwargs["diffusion"] = True super(DiffusionCompressor, self).__init__( model=model, - platform=platform, tokenizer=None, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 041a5e785..2cfa457b7 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -86,8 +86,8 @@ class MLLMCompressor(BaseCompressor): Args: model: The PyTorch model to be quantized. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] tokenizer: An optional tokenizer for processing input data. + platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] processor: Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text, vision and audio). image_processor: Image processor for special model like llava. 
@@ -145,8 +145,8 @@ class MLLMCompressor(BaseCompressor): def __init__( self, model: Union[torch.nn.Module, str], - platform: str = "hf", tokenizer=None, + platform: str = "hf", processor=None, image_processor=None, scheme: Union[str, dict, QuantizationScheme] = "W4A16", @@ -259,8 +259,8 @@ def __init__( kwargs["mllm"] = True super(MLLMCompressor, self).__init__( model=model, - platform=platform, tokenizer=tokenizer, + platform=platform, scheme=scheme, layer_config=layer_config, dataset=dataset, diff --git a/auto_round/envs.py b/auto_round/envs.py index 9c9c503fd..43351d503 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -18,12 +18,12 @@ if TYPE_CHECKING: AR_LOG_LEVEL: str = "INFO" - MODEL_PLATFORM: str = "HF" + AUTOROUND_USE_MODELSCOPE: bool = "False" environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), - "MODEL_PLATFORM": lambda: os.getenv("MODEL_PLATFORM", "HF").upper(), + "AUTOROUND_USE_MODELSCOPE": lambda: os.getenv("AUTOROUND_USE_MODELSCOPE ", "False").lower() in ["1", "true"], } diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 9415d5385..e2998196b 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -149,10 +149,13 @@ def check_start_with_block_name(name: str, block_name_to_quantize: list): def download_or_get_path(repo_id: str, platform: str = None) -> str: - from auto_round.envs import MODEL_PLATFORM + from auto_round.envs import AUTOROUND_USE_MODELSCOPE if platform is None: - platform = MODEL_PLATFORM.lower() + if AUTOROUND_USE_MODELSCOPE: + platform = "model_scope" + else: + platform = "hf" if platform == "model_scope": return download_modelscope_model(repo_id) @@ -221,7 +224,7 @@ def llm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["MODEL_PLATFORM"] = platform.upper() + os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" if platform == "model_scope": from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer else: @@ -306,7 +309,7 @@ def mllm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["MODEL_PLATFORM"] = platform.upper() + os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" if platform == "model_scope": import modelscope From adf3955e8b1aef964fa2764f547bd8181f0ca2b1 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 22:56:15 -0400 Subject: [PATCH 5/6] update Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 2 +- auto_round/envs.py | 4 ++-- auto_round/utils/model.py | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 1bed3255c..19a191f48 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -235,7 +235,7 @@ def __init__( # Scale factor for RAM usage per parameter. 
mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - if os.getenv("AUTOROUND_USE_MODELSCOPE", False): + if os.getenv("AR_USE_MODELSCOPE", False): platform = "model_scope" self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) diff --git a/auto_round/envs.py b/auto_round/envs.py index 43351d503..6c09a2aad 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -18,12 +18,12 @@ if TYPE_CHECKING: AR_LOG_LEVEL: str = "INFO" - AUTOROUND_USE_MODELSCOPE: bool = "False" + AR_USE_MODELSCOPE: bool = "False" environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), - "AUTOROUND_USE_MODELSCOPE": lambda: os.getenv("AUTOROUND_USE_MODELSCOPE ", "False").lower() in ["1", "true"], + "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE ", "False").lower() in ["1", "true"], } diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index e2998196b..9bd29dabf 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -149,10 +149,10 @@ def check_start_with_block_name(name: str, block_name_to_quantize: list): def download_or_get_path(repo_id: str, platform: str = None) -> str: - from auto_round.envs import AUTOROUND_USE_MODELSCOPE + from auto_round.envs import AR_USE_MODELSCOPE if platform is None: - if AUTOROUND_USE_MODELSCOPE: + if AR_USE_MODELSCOPE: platform = "model_scope" else: platform = "hf" @@ -164,7 +164,7 @@ def download_or_get_path(repo_id: str, platform: str = None) -> str: def download_modelscope_model(repo_id: str, local_dir: str = None, cache_dir: str = None): - from modelscope.utils.file_utils import get_modelscope_cache_dir + from modelscope.utils.file_utils import get_modelscope_cache_dir # pylint: disable=E0401 system_cache = cache_dir if cache_dir is not None else get_modelscope_cache_dir() if local_dir: @@ -176,7 +176,7 @@ def download_modelscope_model(repo_id: str, local_dir: str = None, cache_dir: st if os.path.exists(directory): return directory else: - from modelscope.hub.snapshot_download import snapshot_download + from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=E0401 return snapshot_download(repo_id) @@ -224,9 +224,9 @@ def llm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" + os.environ["AR_USE_MODELSCOPE"] = "True" if platform.lower() == "model_scope" else "False" if platform == "model_scope": - from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer + from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer # pylint: disable=E0401 else: from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer @@ -309,11 +309,11 @@ def mllm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." 
- os.environ["AUTOROUND_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" + os.environ["AR_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" if platform == "model_scope": - import modelscope - from modelscope import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + import modelscope # pylint: disable=E0401 + from modelscope import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer # pylint: disable=E0401 base_lib = modelscope else: @@ -423,7 +423,7 @@ def mllm_load_model( ) try: if platform == "model_scope": - from modelscope import AutoImageProcessor + from modelscope import AutoImageProcessor # pylint: disable=E0401 else: from transformers import AutoImageProcessor From db96ade76f819592f64a5b6dcb8b08d56dfea65c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 29 Oct 2025 23:34:28 -0400 Subject: [PATCH 6/6] update Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 3 ++- auto_round/envs.py | 29 ++++++++++++++++++++++++++++- auto_round/utils/model.py | 11 +++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 19a191f48..997dc64f3 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -30,6 +30,7 @@ from tqdm import tqdm from transformers import set_seed +from auto_round import envs from auto_round.compressors.utils import ( block_forward, check_need_act_calibration, @@ -235,7 +236,7 @@ def __init__( # Scale factor for RAM usage per parameter. mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) - if os.getenv("AR_USE_MODELSCOPE", False): + if envs.AR_USE_MODELSCOPE: platform = "model_scope" self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) diff --git a/auto_round/envs.py b/auto_round/envs.py index 6c09a2aad..3d9b0dfa3 100644 --- a/auto_round/envs.py +++ b/auto_round/envs.py @@ -23,7 +23,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # this is used for configuring the default logging level "AR_LOG_LEVEL": lambda: os.getenv("AR_LOG_LEVEL", "INFO").upper(), - "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE ", "False").lower() in ["1", "true"], + "AR_USE_MODELSCOPE": lambda: os.getenv("AR_USE_MODELSCOPE", "False").lower() in ["1", "true"], } @@ -43,3 +43,30 @@ def is_set(name: str): if name in environment_variables: return name in os.environ raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def set_config(**kwargs): + """ + Set configuration values for environment variables. + + Args: + **kwargs: Keyword arguments where keys are environment variable names + and values are the desired values to set. 
+ + Example: + set_config(AR_LOG_LEVEL="DEBUG", AR_USE_MODELSCOPE=True) + """ + for key, value in kwargs.items(): + if key in environment_variables: + # Convert value to appropriate string format + if key == "AR_USE_MODELSCOPE": + # Handle boolean values for AR_USE_MODELSCOPE + str_value = "true" if value in [True, "True", "true", "1", 1] else "false" + else: + # For other variables, convert to string + str_value = str(value) + + # Set the environment variable + os.environ[key] = str_value + else: + raise AttributeError(f"module {__name__!r} has no attribute {key!r}") diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 9bd29dabf..427ec7f9c 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -23,6 +23,7 @@ import torch import transformers +from auto_round import envs from auto_round.export.export_to_gguf.config import ModelType from auto_round.logger import logger from auto_round.schemes import QuantizationScheme @@ -149,10 +150,10 @@ def check_start_with_block_name(name: str, block_name_to_quantize: list): def download_or_get_path(repo_id: str, platform: str = None) -> str: - from auto_round.envs import AR_USE_MODELSCOPE + from auto_round import envs if platform is None: - if AR_USE_MODELSCOPE: + if envs.AR_USE_MODELSCOPE: platform = "model_scope" else: platform = "hf" @@ -224,7 +225,8 @@ def llm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["AR_USE_MODELSCOPE"] = "True" if platform.lower() == "model_scope" else "False" + if platform.lower() == "model_scope" and not envs.AR_USE_MODELSCOPE: + envs.set_config(AR_USE_MODELSCOPE=True) if platform == "model_scope": from modelscope import AutoModel, AutoModelForCausalLM, AutoTokenizer # pylint: disable=E0401 else: @@ -309,7 +311,8 @@ def mllm_load_model( "hf", "model_scope", ], "current only support hf or model_scope platform to load pretrained model." - os.environ["AR_USE_MODELSCOPE"] = "model_scope" if platform.lower() == "model_scope" else "hf" + if platform.lower() == "model_scope" and not envs.AR_USE_MODELSCOPE: + envs.set_config(AR_USE_MODELSCOPE=True) if platform == "model_scope": import modelscope # pylint: disable=E0401
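
For readers of the series, a minimal usage sketch of the end state. The model id, scheme, and save path below are illustrative and mirror the unit test added in patch 2; the environment-variable path reflects the final AR_USE_MODELSCOPE handling from patches 5 and 6.

    # Sketch: quantize a ModelScope-hosted checkpoint instead of a Hugging Face one.
    # Model id, scheme, and save path are illustrative (taken from test/test_cpu/test_model_scope.py).
    from auto_round import AutoRound

    autoround = AutoRound(
        "Qwen/Qwen2.5-0.5B-Instruct",   # repo id resolved on ModelScope instead of the HF hub
        platform="model_scope",          # equivalently, set AR_USE_MODELSCOPE=1 in the environment
        scheme="W4A16",
        iters=0,
    )
    autoround.quantize_and_save("./saved")

The same switch is exposed on the command line through the new --platform flag (hf by default, model_scope for ModelScope); diffusion models still raise NotImplementedError for any platform other than hf.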