From bad0db23651bd3b659afbff7b60cca2b3395a3ea Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 03:24:51 -0400
Subject: [PATCH 1/8] fix cuda ut

Signed-off-by: n1ck-guo
---
 test/test_cuda/test_transformers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py
index 6f953339d..5fa2fef50 100644
--- a/test/test_cuda/test_transformers.py
+++ b/test/test_cuda/test_transformers.py
@@ -30,6 +30,7 @@
 if is_torch_available():
     import torch
 
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 
 # @slow
 @require_torch_gpu
From 26e6a41baab0e81804ef53a6f86d7ace3f08f9dd Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 03:46:21 -0400
Subject: [PATCH 2/8] fix

Signed-off-by: n1ck-guo
---
 auto_round/autoround.py             | 11 +++++++++--
 auto_round/script/llm.py            | 10 +++++++++-
 auto_round/script/mllm.py           | 12 ++++++++++--
 test/test_cuda/test_transformers.py |  2 --
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 921747682..421f6c982 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -224,7 +224,8 @@ def __init__(
         to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None)
         enable_norm_bias_tuning: bool = kwargs.pop("enable_norm_bias_tuning", False)
         enable_quanted_input: bool = kwargs.pop("enable_quanted_input", True)
-        disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", False)
+        disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", True)
+        enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False)
         static_kv_dtype = kwargs.pop("static_kv_dtype", None)
         device = kwargs.pop("device", None)
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
@@ -235,8 +236,14 @@ def __init__(
         if kwargs:
             logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")
 
-        if not disable_deterministic_algorithms:
+        # deprecated, default not to use torch.use_deterministic_algorithms
+        if not disable_deterministic_algorithms or enable_deterministic_algorithms:
+            if not disable_deterministic_algorithms:
+                logger.warning(
+                    "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+                    " please use enable_deterministic_algorithms instead. ")
             if "CUBLAS_WORKSPACE_CONFIG" not in os.environ:
+                breakpoint()
                 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
             torch.use_deterministic_algorithms(True, warn_only=False)
 
diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index cbf30edb1..bd23dab1a 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -208,6 +208,9 @@ def __init__(self, *args, **kwargs):
         self.add_argument(
             "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
         )
+        self.add_argument(
+            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+        )
 
         self.add_argument(
             "--disable_opt_rtn",
@@ -543,6 +546,11 @@ def tune(args):
     scheme = args.scheme.upper()
     if scheme not in PRESET_SCHEMES:
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
+    if args.disable_deterministic_algorithms:
+        logger.warning(
+            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. ")
+    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
         tokenizer=tokenizer,
@@ -580,7 +588,7 @@ def tune(args):
         super_group_size=args.super_group_size,
         super_bits=args.super_bits,
         disable_opt_rtn=args.disable_opt_rtn,
-        disable_deterministic_algorithms=args.disable_deterministic_algorithms,
+        enable_deterministic_algorithms=enable_deterministic_algorithms,
         enable_alg_ext=args.enable_alg_ext,
         **mllm_kwargs,
     )
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index 162b2ee54..a3cd46994 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -176,6 +176,10 @@ def __init__(self, *args, **kwargs):
             "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
         )
 
+        self.add_argument(
+            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+        )
+
         ## ======================= VLM =======================
         self.add_argument(
             "--quant_nontext_module",
@@ -435,7 +439,11 @@ def tune(args):
     scheme = args.scheme.upper()
     if scheme not in PRESET_SCHEMES:
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
-
+    if args.disable_deterministic_algorithms:
+        logger.warning(
+            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. ")
+    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
         tokenizer,
@@ -473,7 +481,7 @@ def tune(args):
         model_kwargs=model_kwargs,
         data_type=args.data_type,
         disable_opt_rtn=args.disable_opt_rtn,
-        disable_deterministic_algorithms=args.disable_deterministic_algorithms,
+        enable_deterministic_algorithms=enable_deterministic_algorithms,
     )
 
     model_name = args.model.rstrip("/")
diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py
index 5fa2fef50..0b9046069 100644
--- a/test/test_cuda/test_transformers.py
+++ b/test/test_cuda/test_transformers.py
@@ -30,8 +30,6 @@
 if is_torch_available():
     import torch
 
-os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-
 # @slow
 @require_torch_gpu
 @require_accelerate
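The new gating in autoround.py is easy to misread, so a condensed sketch of the resulting truth table may help (the helper name is hypothetical; the condition and defaults are copied from the patch):

    # disable_deterministic_algorithms now defaults to True, making
    # determinism opt-in; the old opt-out spelling still works but warns.
    def wants_determinism(disable_deterministic_algorithms=True, enable_deterministic_algorithms=False):
        return not disable_deterministic_algorithms or enable_deterministic_algorithms

    assert wants_determinism() is False                                       # new default: off
    assert wants_determinism(enable_deterministic_algorithms=True) is True    # new opt-in flag
    assert wants_determinism(disable_deterministic_algorithms=False) is True  # deprecated opt-out, warns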
From a6a9ce1ae627d5de203baed1921f9b7a66b40456 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 03:46:50 -0400
Subject: [PATCH 3/8] clean

Signed-off-by: n1ck-guo
---
 auto_round/autoround.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 421f6c982..7f38e81d1 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -243,7 +243,6 @@ def __init__(
                     "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
                     " please use enable_deterministic_algorithms instead. ")
             if "CUBLAS_WORKSPACE_CONFIG" not in os.environ:
-                breakpoint()
                 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
             torch.use_deterministic_algorithms(True, warn_only=False)
 

From d5ffa28bedf3e9e4042749428debf49727936534 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 9 Sep 2025 07:47:24 +0000
Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py             | 5 +++--
 auto_round/script/llm.py            | 7 ++++---
 auto_round/script/mllm.py           | 7 ++++---
 test/test_cuda/test_transformers.py | 1 +
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 7f38e81d1..27e9f8416 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -240,8 +240,9 @@ def __init__(
         if not disable_deterministic_algorithms or enable_deterministic_algorithms:
             if not disable_deterministic_algorithms:
                 logger.warning(
-                    "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-                    " please use enable_deterministic_algorithms instead. ")
+                    "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+                    " please use enable_deterministic_algorithms instead. "
+                )
             if "CUBLAS_WORKSPACE_CONFIG" not in os.environ:
                 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
             torch.use_deterministic_algorithms(True, warn_only=False)
diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index bd23dab1a..dd98bd740 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -209,7 +209,7 @@ def __init__(self, *args, **kwargs):
             "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
         )
         self.add_argument(
-            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+            "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms."
         )
 
         self.add_argument(
             "--disable_opt_rtn",
@@ -548,8 +548,9 @@ def tune(args):
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
     if args.disable_deterministic_algorithms:
         logger.warning(
-            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-            " please use enable_deterministic_algorithms instead. ")
+            "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. "
+        )
     enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index a3cd46994..54286b15e 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -177,7 +177,7 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+            "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms."
         )
 
         ## ======================= VLM =======================
@@ -441,8 +441,9 @@ def tune(args):
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
     if args.disable_deterministic_algorithms:
         logger.warning(
-            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-            " please use enable_deterministic_algorithms instead. ")
+            "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. "
+        )
     enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py
index 0b9046069..6f953339d 100644
--- a/test/test_cuda/test_transformers.py
+++ b/test/test_cuda/test_transformers.py
@@ -30,6 +30,7 @@
 if is_torch_available():
     import torch
 
+
 # @slow
 @require_torch_gpu
 @require_accelerate

From de8787f813dd54fb5eacd0e75477e1887a9030a2 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 04:09:45 -0400
Subject: [PATCH 5/8] update

Signed-off-by: n1ck-guo
---
 auto_round/script/llm.py  | 5 ++---
 auto_round/script/mllm.py | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index bd23dab1a..2b990642e 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -206,7 +206,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_alg_ext", action="store_true", help="whether to enable probably better algorithm")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
         )
         self.add_argument(
             "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
@@ -550,7 +550,6 @@ def tune(args):
         logger.warning(
             "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. ")
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
         tokenizer=tokenizer,
@@ -588,7 +587,7 @@ def tune(args):
         super_group_size=args.super_group_size,
         super_bits=args.super_bits,
         disable_opt_rtn=args.disable_opt_rtn,
-        enable_deterministic_algorithms=enable_deterministic_algorithms,
+        enable_deterministic_algorithms=args.enable_deterministic_algorithms,
         enable_alg_ext=args.enable_alg_ext,
         **mllm_kwargs,
     )
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index a3cd46994..b985946f8 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -173,7 +173,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
        )
 
         self.add_argument(
@@ -443,7 +443,6 @@ def tune(args):
         logger.warning(
             "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. ")
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
         tokenizer,
@@ -481,7 +480,7 @@ def tune(args):
         model_kwargs=model_kwargs,
         data_type=args.data_type,
         disable_opt_rtn=args.disable_opt_rtn,
-        enable_deterministic_algorithms=enable_deterministic_algorithms,
+        enable_deterministic_algorithms=args.enable_deterministic_algorithms,
     )
 
     model_name = args.model.rstrip("/")

From 6d96f271ebee38f6fe495011559406fbbe6f4885 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 04:11:48 -0400
Subject: [PATCH 6/8] clean

Signed-off-by: n1ck-guo
---
 auto_round/script/llm.py  | 1 -
 auto_round/script/mllm.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index 199e66a4d..464dc812c 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -521,7 +521,6 @@ def tune(args):
         logger.warning(
             "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. ")
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
         tokenizer=tokenizer,
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index a18dc3fa2..b1637dbfa 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -444,7 +444,6 @@ def tune(args):
             "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. "
         )
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
         tokenizer,
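With the wiring above, --enable_deterministic_algorithms is forwarded verbatim to the quantizer, so the Python API mirrors the CLI one-to-one. A minimal sketch, assuming the AutoRound constructor accepts a model and tokenizer as in the project README (the model choice here is a placeholder):

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from auto_round import AutoRound

    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    # The opt-in kwarg added by this series; the deprecated
    # disable_deterministic_algorithms=False spelling still parses but warns.
    autoround = AutoRound(model=model, tokenizer=tokenizer, enable_deterministic_algorithms=True)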
From 29e229ae5b322802b9042c6517492edcbcfd994a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 9 Sep 2025 08:12:39 +0000
Subject: [PATCH 7/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/script/llm.py  | 125 +++++++++++++++++++++++++-----------
 auto_round/script/mllm.py |   4 +-
 2 files changed, 87 insertions(+), 42 deletions(-)

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index 464dc812c..1d0ee5b2b 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -48,7 +48,8 @@ class BasicArgumentParser(argparse.ArgumentParser):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.add_argument(
-            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path")
+            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path"
+        )
 
         self.add_argument("--mllm", action="store_true", help="whether to quant multi-modal model.")
@@ -69,10 +70,12 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--act_bits", default=None, type=int, help="activation bits")
         self.add_argument("--act_group_size", default=None, type=int, help="activation group size")
         self.add_argument(
-            "--super_group_size", default=None, type=int, help="the number of super group size when use double quant.")
+            "--super_group_size", default=None, type=int, help="the number of super group size when use double quant."
+        )
 
         self.add_argument(
-            "--super_bits", default=None, type=int, help="number of scale and mins quant bits for double quant.")
+            "--super_bits", default=None, type=int, help="number of scale and mins quant bits for double quant."
+        )
 
         self.add_argument("--act_data_type", "--act_dtype", default=None, type=str, help="activation data type")
         self.add_argument("--disable_act_dynamic", action="store_true", help="activation static quantization")
@@ -91,7 +94,8 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--dataset", default="NeelNanda/pile-10k", type=str, help="the dataset for quantization training")
+            "--dataset", default="NeelNanda/pile-10k", type=str, help="the dataset for quantization training"
+        )
 
         self.add_argument(
             "--minmax_lr",
@@ -127,17 +131,20 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--output_dir", default="./tmp_autoround", type=str, help="the directory to save quantized model")
+            "--output_dir", default="./tmp_autoround", type=str, help="the directory to save quantized model"
+        )
 
         self.add_argument("--disable_amp", action="store_true", help="disable amp")
 
         self.add_argument(
-            "--disable_minmax_tuning", action="store_true", help="whether to disable enable weight minmax tuning")
+            "--disable_minmax_tuning", action="store_true", help="whether to disable enable weight minmax tuning"
+        )
 
         self.add_argument("--enable_norm_bias_tuning", action="store_true", help="whether to enable norm bias tuning")
 
         self.add_argument(
-            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code")
+            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code"
+        )
 
         self.add_argument(
             "--disable_quanted_input",
@@ -178,7 +185,8 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--fp_layers", default="", type=str, help="list of Layer names to maintain original data type")
+            "--fp_layers", default="", type=str, help="list of Layer names to maintain original data type"
+        )
 
         self.add_argument(
             "--not_use_best_mse",
@@ -200,9 +208,11 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_alg_ext", action="store_true", help="whether to enable probably better algorithm")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms",
+            action="store_true",
+            help="deprecated, disable torch deterministic algorithms.",
         )
         self.add_argument(
             "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms."
         )
 
         self.add_argument(
             "--disable_opt_rtn",
@@ -237,7 +247,8 @@ def __init__(self, *args, **kwargs):
 
         ## ======================= eval =======================
         self.add_argument(
-            "--disable_eval", action="store_true", help="whether to disable lm-eval evaluation after tuning")
+            "--disable_eval", action="store_true", help="whether to disable lm-eval evaluation after tuning"
+        )
 
         self.add_argument(
             "--tasks",
@@ -263,7 +274,8 @@ def __init__(self, *args, **kwargs):
 
         self.add_argument("--eval_task_by_task", action="store_true", help="whether to eval task by task.")
         self.add_argument(
-            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation.")
+            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation."
+        )
@@ -271,7 +283,8 @@ class EvalArgumentParser(argparse.ArgumentParser):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.add_argument(
-            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path")
+            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path"
+        )
         self.add_argument("--mllm", action="store_true", help="whether to eval multi-modal model.")
         self.add_argument(
             "--device_map",
@@ -294,11 +307,13 @@ def __init__(self, *args, **kwargs):
             help="lm-eval tasks",
         )
         self.add_argument(
-            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code")
+            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code"
+        )
         self.add_argument("--eval_bs", "--bs", "--batch_size", default=None, type=int, help="batch size in evaluation")
         self.add_argument("--eval_task_by_task", action="store_true", help="whether to eval task by task.")
         self.add_argument(
-            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation.")
+            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation."
+        )
         self.add_argument(
             "--limit",
             type=float,
@@ -317,12 +332,14 @@ def setup_parser():
     parser.add_argument("--iters", "--iter", default=200, type=int, help="iteration to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
     return args
@@ -336,12 +353,14 @@ def setup_best_parser():
     parser.add_argument("--iters", "--iter", default=1000, type=int, help="iterations to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=512, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
     args.low_gpu_mem_usage = True
@@ -357,12 +376,14 @@ def setup_light_parser():
     parser.add_argument("--iters", "--iter", default=50, type=int, help="iterations to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=5e-3, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=5e-3, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
@@ -377,12 +398,14 @@ def setup_fast_parser():
     parser.add_argument("--iters", default=200, type=int, help="iterations to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=512, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=512, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
@@ -418,7 +441,8 @@ def tune(args):
     if "auto_gptq" in args.format and args.asym is True:
         logger.warning(
             "the auto_gptq kernel has issues with asymmetric quantization. "
-            "It is recommended to use sym quantization or --format='auto_round'")
+            "It is recommended to use sym quantization or --format='auto_round'"
+        )
 
     if "marlin" in args.format and args.asym is True:
         raise RuntimeError("marlin backend only supports sym quantization, please remove --asym")
@@ -432,7 +456,8 @@ def tune(args):
     if args.enable_torch_compile:
         logger.info(
             "`torch.compile` is enabled to reduce tuning costs. "
-            "If it causes issues, you can disable it by removing `--enable_torch_compile` argument.")
+            "If it causes issues, you can disable it by removing `--enable_torch_compile` argument."
+        )
 
     model_name = args.model
     if model_name[-1] == "/":
@@ -487,8 +512,12 @@ def tune(args):
     if len(not_quantize_layer_names) > 0:
         logger.info(f"{not_quantize_layer_names} will not be quantized.")
     for format in formats:
-        if ("auto_round" not in format and "fake" not in format and "awq" not in format and
-                "llm_compressor" not in format):
+        if (
+            "auto_round" not in format
+            and "fake" not in format
+            and "awq" not in format
+            and "llm_compressor" not in format
+        ):
             # TODO gptq could support some mixed precision config
             logger.warning(f"mixed precision exporting does not support {format} currently")
 
@@ -497,13 +526,15 @@ def tune(args):
         if "auto_round" not in format and "fake" not in format:
             auto_round_formats = [s for s in SUPPORTED_FORMATS if s.startswith("auto_round")]
             raise ValueError(
-                f"{format} is not supported for lm-head quantization, please change to {auto_round_formats}")
+                f"{format} is not supported for lm-head quantization, please change to {auto_round_formats}"
+            )
 
     if "auto_awq" in args.format:
         from auto_round.utils import check_awq_gemm_compatibility
 
         awq_supported, info = check_awq_gemm_compatibility(
-            model, args.bits, args.group_size, not args.asym, layer_config)
+            model, args.bits, args.group_size, not args.asym, layer_config
+        )
         if not awq_supported:
             logger.warning(f"The AutoAWQ format may not be supported due to {info}")
@@ -520,7 +551,8 @@ def tune(args):
     if args.disable_deterministic_algorithms:
         logger.warning(
             "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-            " please use enable_deterministic_algorithms instead. ")
+            " please use enable_deterministic_algorithms instead. "
+        )
     autoround = round(
         model=model,
         tokenizer=tokenizer,
@@ -642,12 +674,14 @@ def tune(args):
             if eval_model_dtype == "float32" or eval_model_dtype == "auto":
                 logger.warning(
                     "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                    " but may affect accuracy.")
+                    " but may affect accuracy."
+                )
             if gguf_file is None:
                 logger.error("Cannot find correct gguf file for evaluation, please check.")
                 sys.exit(-1)
             model = AutoModelForCausalLM.from_pretrained(
-                eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype)
+                eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+            )
             model.eval()
             tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
         else:
@@ -707,7 +741,8 @@ def tune(args):
         from auto_round.eval.evaluation import simple_evaluate
 
         tasks, model_args, device_str = _eval_init(
-            args.tasks, eval_folder, args.device_map, args.disable_trust_remote_code, dtype=eval_model_dtype)
+            args.tasks, eval_folder, args.device_map, args.disable_trust_remote_code, dtype=eval_model_dtype
+        )
         st = time.time()
         if "llama" in args.model.lower():
             model_args += ",add_bos_token=True"
@@ -741,7 +776,8 @@ def eval(args):
     import time
 
     tasks, model_args, device_str = _eval_init(
-        args.tasks, args.model, args.device_map, args.disable_trust_remote_code, args.eval_model_dtype)
+        args.tasks, args.model, args.device_map, args.disable_trust_remote_code, args.eval_model_dtype
+    )
 
     # load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES
     from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model
@@ -771,13 +807,16 @@ def eval(args):
         if eval_model_dtype == "float32" or eval_model_dtype == "auto":
             logger.warning(
                 "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                " but may affect accuracy.")
+                " but may affect accuracy."
+            )
         model = AutoModelForCausalLM.from_pretrained(
-            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype)
+            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+        )
         model.eval()
         st = time.time()
         res = simple_evaluate_user_model(
-            model, tokenizer, tasks=tasks, batch_size=batch_size, device=device_str, limit=args.limit)
+            model, tokenizer, tasks=tasks, batch_size=batch_size, device=device_str, limit=args.limit
+        )
         print(make_table(res))
         print("evaluation running time=%ds" % (time.time() - st))
     else:
@@ -845,10 +884,12 @@ def eval_task_by_task(
         if eval_model_dtype == "float32" or eval_model_dtype == "auto":
             logger.warning(
                 "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                " but may affect accuracy.")
+                " but may affect accuracy."
+            )
 
         model = AutoModelForCausalLM.from_pretrained(
-            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype)
+            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+        )
         model.eval()
         parallelism = False
         hflm = HFLM(
@@ -876,7 +917,8 @@ def eval_task_by_task(
         while retry_times:
             try:
                 res = lm_simple_evaluate(
-                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit)
+                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit
+                )
                 break
             except Exception as e:
                 cuda_error_msg = traceback.format_exc()
@@ -887,7 +929,8 @@ def eval_task_by_task(
                         hflm.batch_sizes[k] = max(v // 2, 1)
                     logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
                     res = lm_simple_evaluate(
-                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit)
+                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit
+                    )
                     hflm.batch_sizes = ori_batch_sizes
                 except Exception as e:
                     traceback.print_exc()
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index b1637dbfa..bbb2b42eb 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -173,7 +173,9 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms",
+            action="store_true",
+            help="deprecated, disable torch deterministic algorithms.",
         )
 
         self.add_argument(
+ ) model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype) + model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + ) model.eval() parallelism = False hflm = HFLM( @@ -876,7 +917,8 @@ def eval_task_by_task( while retry_times: try: res = lm_simple_evaluate( - model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit) + model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit + ) break except Exception as e: cuda_error_msg = traceback.format_exc() @@ -887,7 +929,8 @@ def eval_task_by_task( hflm.batch_sizes[k] = max(v // 2, 1) logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.") res = lm_simple_evaluate( - model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit) + model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit + ) hflm.batch_sizes = ori_batch_sizes except Exception as e: traceback.print_exc() diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index b1637dbfa..bbb2b42eb 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -173,7 +173,9 @@ def __init__(self, *args, **kwargs): self.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile") self.add_argument( - "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms." + "--disable_deterministic_algorithms", + action="store_true", + help="deprecated, disable torch deterministic algorithms.", ) self.add_argument( From cca567e33e4c45886a57252acd14e7083580130f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 9 Sep 2025 22:24:48 +0800 Subject: [PATCH 8/8] set use_deterministic_algorithms with warn_only=True as default --- auto_round/autoround.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 35786ab91..03e012578 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -235,7 +235,8 @@ def __init__( if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") - + if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # deprecated, default not to use torch.use_deterministic_algorithms if not disable_deterministic_algorithms or enable_deterministic_algorithms: if not disable_deterministic_algorithms: @@ -243,9 +244,10 @@ def __init__( "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated," " please use enable_deterministic_algorithms instead. " ) - if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + torch.use_deterministic_algorithms(True, warn_only=False) + else: + torch.use_deterministic_algorithms(True, warn_only=True) if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead")