From bad0db23651bd3b659afbff7b60cca2b3395a3ea Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 03:24:51 -0400
Subject: [PATCH 1/8] fix cuda ut

Signed-off-by: n1ck-guo
---
 test/test_cuda/test_transformers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py
index 6f953339d..5fa2fef50 100644
--- a/test/test_cuda/test_transformers.py
+++ b/test/test_cuda/test_transformers.py
@@ -30,6 +30,7 @@
 if is_torch_available():
     import torch
 
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 
 # @slow
 @require_torch_gpu
From 26e6a41baab0e81804ef53a6f86d7ace3f08f9dd Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 03:46:21 -0400
Subject: [PATCH 2/8] fix

Signed-off-by: n1ck-guo
---
 auto_round/autoround.py             | 11 +++++++++--
 auto_round/script/llm.py            | 10 +++++++++-
 auto_round/script/mllm.py           | 12 ++++++++++--
 test/test_cuda/test_transformers.py |  2 --
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 921747682..421f6c982 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -224,7 +224,8 @@ def __init__(
         to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None)
         enable_norm_bias_tuning: bool = kwargs.pop("enable_norm_bias_tuning", False)
         enable_quanted_input: bool = kwargs.pop("enable_quanted_input", True)
-        disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", False)
+        disable_deterministic_algorithms = kwargs.pop("disable_deterministic_algorithms", True)
+        enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False)
         static_kv_dtype = kwargs.pop("static_kv_dtype", None)
         device = kwargs.pop("device", None)
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
@@ -235,8 +236,14 @@ def __init__(
         if kwargs:
             logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")
 
-        if not disable_deterministic_algorithms:
+        # deprecated, default not to use torch.use_deterministic_algorithms
+        if not disable_deterministic_algorithms or enable_deterministic_algorithms:
+            if not disable_deterministic_algorithms:
+                logger.warning(
+                    "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+                    " please use enable_deterministic_algorithms instead. ")
             if "CUBLAS_WORKSPACE_CONFIG" not in os.environ:
+                breakpoint()
                 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
             torch.use_deterministic_algorithms(True, warn_only=False)
 
diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index cbf30edb1..bd23dab1a 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -208,6 +208,9 @@ def __init__(self, *args, **kwargs):
         self.add_argument(
             "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
         )
+        self.add_argument(
+            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+        )
 
         self.add_argument(
             "--disable_opt_rtn",
@@ -543,6 +546,11 @@ def tune(args):
     scheme = args.scheme.upper()
     if scheme not in PRESET_SCHEMES:
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
+    if args.disable_deterministic_algorithms:
+        logger.warning(
+            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. ")
+    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
         tokenizer=tokenizer,
@@ -580,7 +588,7 @@ def tune(args):
         super_group_size=args.super_group_size,
         super_bits=args.super_bits,
         disable_opt_rtn=args.disable_opt_rtn,
-        disable_deterministic_algorithms=args.disable_deterministic_algorithms,
+        enable_deterministic_algorithms=enable_deterministic_algorithms,
         enable_alg_ext=args.enable_alg_ext,
         **mllm_kwargs,
     )
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index 162b2ee54..a3cd46994 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -176,6 +176,10 @@ def __init__(self, *args, **kwargs):
             "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
         )
 
+        self.add_argument(
+            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+        )
+
         ## ======================= VLM =======================
         self.add_argument(
             "--quant_nontext_module",
@@ -435,7 +439,11 @@ def tune(args):
     scheme = args.scheme.upper()
     if scheme not in PRESET_SCHEMES:
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
-
+    if args.disable_deterministic_algorithms:
+        logger.warning(
+            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. ")
+    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
         tokenizer,
@@ -473,7 +481,7 @@ def tune(args):
         model_kwargs=model_kwargs,
         data_type=args.data_type,
         disable_opt_rtn=args.disable_opt_rtn,
-        disable_deterministic_algorithms=args.disable_deterministic_algorithms,
+        enable_deterministic_algorithms=enable_deterministic_algorithms,
     )
 
     model_name = args.model.rstrip("/")
diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py
index 5fa2fef50..0b9046069 100644
--- a/test/test_cuda/test_transformers.py
+++ b/test/test_cuda/test_transformers.py
@@ -30,8 +30,6 @@
 if is_torch_available():
     import torch
 
-os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-
 # @slow
 @require_torch_gpu
 @require_accelerate
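The new gating in autoround.py is easy to misread, so a condensed sketch of the resulting truth table may help (the helper name is hypothetical; the condition and defaults are copied from the patch):

    # disable_deterministic_algorithms now defaults to True, making
    # determinism opt-in; the old opt-out spelling still works but warns.
    def wants_determinism(disable_deterministic_algorithms=True, enable_deterministic_algorithms=False):
        return not disable_deterministic_algorithms or enable_deterministic_algorithms

    assert wants_determinism() is False                                       # new default: off
    assert wants_determinism(enable_deterministic_algorithms=True) is True    # new opt-in flag
    assert wants_determinism(disable_deterministic_algorithms=False) is True  # deprecated opt-out, warns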
From a6a9ce1ae627d5de203baed1921f9b7a66b40456 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 03:46:50 -0400
Subject: [PATCH 3/8] clean

Signed-off-by: n1ck-guo
---
 auto_round/autoround.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 421f6c982..7f38e81d1 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -243,7 +243,6 @@ def __init__(
                     "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
                     " please use enable_deterministic_algorithms instead. ")
             if "CUBLAS_WORKSPACE_CONFIG" not in os.environ:
-                breakpoint()
                 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
             torch.use_deterministic_algorithms(True, warn_only=False)
 

From d5ffa28bedf3e9e4042749428debf49727936534 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 9 Sep 2025 07:47:24 +0000
Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py             | 5 +++--
 auto_round/script/llm.py            | 7 ++++---
 auto_round/script/mllm.py           | 7 ++++---
 test/test_cuda/test_transformers.py | 1 +
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 7f38e81d1..27e9f8416 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -240,8 +240,9 @@ def __init__(
         if not disable_deterministic_algorithms or enable_deterministic_algorithms:
             if not disable_deterministic_algorithms:
                 logger.warning(
-                    "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-                    " please use enable_deterministic_algorithms instead. ")
+                    "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+                    " please use enable_deterministic_algorithms instead. "
+                )
             if "CUBLAS_WORKSPACE_CONFIG" not in os.environ:
                 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
             torch.use_deterministic_algorithms(True, warn_only=False)
diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index bd23dab1a..dd98bd740 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -209,7 +209,7 @@ def __init__(self, *args, **kwargs):
             "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
         )
         self.add_argument(
-            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+            "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms."
         )
 
         self.add_argument(
             "--disable_opt_rtn",
@@ -548,8 +548,9 @@ def tune(args):
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
     if args.disable_deterministic_algorithms:
         logger.warning(
-            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-            " please use enable_deterministic_algorithms instead. ")
+            "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. "
+        )
     enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index a3cd46994..54286b15e 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -177,7 +177,7 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
+            "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms."
         )
 
         ## ======================= VLM =======================
@@ -441,8 +441,9 @@ def tune(args):
         raise ValueError(f"{scheme} is not supported. only {PRESET_SCHEMES.keys()} are supported ")
     if args.disable_deterministic_algorithms:
         logger.warning(
-            "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-            " please use enable_deterministic_algorithms instead. ")
+            "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
+            " please use enable_deterministic_algorithms instead. "
+        )
     enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py
index 0b9046069..6f953339d 100644
--- a/test/test_cuda/test_transformers.py
+++ b/test/test_cuda/test_transformers.py
@@ -30,6 +30,7 @@
 if is_torch_available():
     import torch
 
+
 # @slow
 @require_torch_gpu
 @require_accelerate

From de8787f813dd54fb5eacd0e75477e1887a9030a2 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 04:09:45 -0400
Subject: [PATCH 5/8] update

Signed-off-by: n1ck-guo
---
 auto_round/script/llm.py  | 5 ++---
 auto_round/script/mllm.py | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index bd23dab1a..2b990642e 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -206,7 +206,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_alg_ext", action="store_true", help="whether to enable probably better algorithm")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
         )
         self.add_argument(
             "--enable_deterministic_algorithms", action="store_true", help="enbale torch deterministic algorithms."
@@ -550,7 +550,6 @@ def tune(args):
         logger.warning(
             "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. ")
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
         tokenizer=tokenizer,
@@ -588,7 +587,7 @@ def tune(args):
         super_group_size=args.super_group_size,
         super_bits=args.super_bits,
         disable_opt_rtn=args.disable_opt_rtn,
-        enable_deterministic_algorithms=enable_deterministic_algorithms,
+        enable_deterministic_algorithms=args.enable_deterministic_algorithms,
         enable_alg_ext=args.enable_alg_ext,
         **mllm_kwargs,
     )
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index a3cd46994..b985946f8 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -173,7 +173,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
        )
 
         self.add_argument(
@@ -443,7 +443,6 @@ def tune(args):
         logger.warning(
             "deafult not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. ")
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
         tokenizer,
@@ -481,7 +480,7 @@ def tune(args):
         model_kwargs=model_kwargs,
         data_type=args.data_type,
         disable_opt_rtn=args.disable_opt_rtn,
-        enable_deterministic_algorithms=enable_deterministic_algorithms,
+        enable_deterministic_algorithms=args.enable_deterministic_algorithms,
     )
 
     model_name = args.model.rstrip("/")

From 6d96f271ebee38f6fe495011559406fbbe6f4885 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Tue, 9 Sep 2025 04:11:48 -0400
Subject: [PATCH 6/8] clean

Signed-off-by: n1ck-guo
---
 auto_round/script/llm.py  | 1 -
 auto_round/script/mllm.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index 199e66a4d..464dc812c 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -521,7 +521,6 @@ def tune(args):
         logger.warning(
             "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. ")
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model=model,
         tokenizer=tokenizer,
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index a18dc3fa2..b1637dbfa 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -444,7 +444,6 @@ def tune(args):
             "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
             " please use enable_deterministic_algorithms instead. "
         )
-    enable_deterministic_algorithms = args.enable_deterministic_algorithms and not args.disable_deterministic_algorithms
     autoround = round(
         model,
         tokenizer,
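With the wiring above, --enable_deterministic_algorithms is forwarded verbatim to the quantizer, so the Python API mirrors the CLI one-to-one. A minimal sketch, assuming the AutoRound constructor accepts a model and tokenizer as in the project README (the model choice here is a placeholder):

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from auto_round import AutoRound

    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    # The opt-in kwarg added by this series; the deprecated
    # disable_deterministic_algorithms=False spelling still parses but warns.
    autoround = AutoRound(model=model, tokenizer=tokenizer, enable_deterministic_algorithms=True)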
From 29e229ae5b322802b9042c6517492edcbcfd994a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 9 Sep 2025 08:12:39 +0000
Subject: [PATCH 7/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/script/llm.py  | 125 +++++++++++++++++++++++++-----------
 auto_round/script/mllm.py |   4 +-
 2 files changed, 87 insertions(+), 42 deletions(-)

diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py
index 464dc812c..1d0ee5b2b 100644
--- a/auto_round/script/llm.py
+++ b/auto_round/script/llm.py
@@ -48,7 +48,8 @@ class BasicArgumentParser(argparse.ArgumentParser):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.add_argument(
-            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path")
+            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path"
+        )
 
         self.add_argument("--mllm", action="store_true", help="whether to quant multi-modal model.")
@@ -69,10 +70,12 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--act_bits", default=None, type=int, help="activation bits")
         self.add_argument("--act_group_size", default=None, type=int, help="activation group size")
         self.add_argument(
-            "--super_group_size", default=None, type=int, help="the number of super group size when use double quant.")
+            "--super_group_size", default=None, type=int, help="the number of super group size when use double quant."
+        )
 
         self.add_argument(
-            "--super_bits", default=None, type=int, help="number of scale and mins quant bits for double quant.")
+            "--super_bits", default=None, type=int, help="number of scale and mins quant bits for double quant."
+        )
 
         self.add_argument("--act_data_type", "--act_dtype", default=None, type=str, help="activation data type")
         self.add_argument("--disable_act_dynamic", action="store_true", help="activation static quantization")
@@ -91,7 +94,8 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--dataset", default="NeelNanda/pile-10k", type=str, help="the dataset for quantization training")
+            "--dataset", default="NeelNanda/pile-10k", type=str, help="the dataset for quantization training"
+        )
 
         self.add_argument(
             "--minmax_lr",
@@ -127,17 +131,20 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--output_dir", default="./tmp_autoround", type=str, help="the directory to save quantized model")
+            "--output_dir", default="./tmp_autoround", type=str, help="the directory to save quantized model"
+        )
 
         self.add_argument("--disable_amp", action="store_true", help="disable amp")
 
         self.add_argument(
-            "--disable_minmax_tuning", action="store_true", help="whether to disable enable weight minmax tuning")
+            "--disable_minmax_tuning", action="store_true", help="whether to disable enable weight minmax tuning"
+        )
 
         self.add_argument("--enable_norm_bias_tuning", action="store_true", help="whether to enable norm bias tuning")
 
         self.add_argument(
-            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code")
+            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code"
+        )
 
         self.add_argument(
             "--disable_quanted_input",
@@ -178,7 +185,8 @@ def __init__(self, *args, **kwargs):
         )
 
         self.add_argument(
-            "--fp_layers", default="", type=str, help="list of Layer names to maintain original data type")
+            "--fp_layers", default="", type=str, help="list of Layer names to maintain original data type"
+        )
 
         self.add_argument(
             "--not_use_best_mse",
@@ -200,9 +208,11 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_alg_ext", action="store_true", help="whether to enable probably better algorithm")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms",
+            action="store_true",
+            help="deprecated, disable torch deterministic algorithms.",
         )
         self.add_argument(
             "--enable_deterministic_algorithms", action="store_true", help="enable torch deterministic algorithms."
         )
 
         self.add_argument(
             "--disable_opt_rtn",
@@ -237,7 +247,8 @@ def __init__(self, *args, **kwargs):
 
         ## ======================= eval =======================
         self.add_argument(
-            "--disable_eval", action="store_true", help="whether to disable lm-eval evaluation after tuning")
+            "--disable_eval", action="store_true", help="whether to disable lm-eval evaluation after tuning"
+        )
 
         self.add_argument(
             "--tasks",
@@ -263,7 +274,8 @@ def __init__(self, *args, **kwargs):
 
         self.add_argument("--eval_task_by_task", action="store_true", help="whether to eval task by task.")
         self.add_argument(
-            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation.")
+            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation."
+        )
@@ -271,7 +283,8 @@ class EvalArgumentParser(argparse.ArgumentParser):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.add_argument(
-            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path")
+            "--model", "--model_name", "--model_name_or_path", default="facebook/opt-125m", help="model name or path"
+        )
         self.add_argument("--mllm", action="store_true", help="whether to eval multi-modal model.")
         self.add_argument(
             "--device_map",
@@ -294,11 +307,13 @@ def __init__(self, *args, **kwargs):
             help="lm-eval tasks",
         )
         self.add_argument(
-            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code")
+            "--disable_trust_remote_code", action="store_true", help="whether to disable trust_remote_code"
+        )
         self.add_argument("--eval_bs", "--bs", "--batch_size", default=None, type=int, help="batch size in evaluation")
         self.add_argument("--eval_task_by_task", action="store_true", help="whether to eval task by task.")
         self.add_argument(
-            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation.")
+            "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation."
+        )
         self.add_argument(
             "--limit",
             type=float,
@@ -317,12 +332,14 @@ def setup_parser():
     parser.add_argument("--iters", "--iter", default=200, type=int, help="iteration to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
     return args
@@ -336,12 +353,14 @@ def setup_best_parser():
     parser.add_argument("--iters", "--iter", default=1000, type=int, help="iterations to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=512, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
     args.low_gpu_mem_usage = True
@@ -357,12 +376,14 @@ def setup_light_parser():
     parser.add_argument("--iters", "--iter", default=50, type=int, help="iterations to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=2048, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=5e-3, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=5e-3, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
@@ -377,12 +398,14 @@ def setup_fast_parser():
     parser.add_argument("--iters", default=200, type=int, help="iterations to tune each block")
 
     parser.add_argument(
-        "--seqlen", "--seq_len", default=512, type=int, help="sequence length of the calibration samples")
+        "--seqlen", "--seq_len", default=512, type=int, help="sequence length of the calibration samples"
+    )
 
     parser.add_argument("--nsamples", "--nsample", default=128, type=int, help="number of samples")
 
     parser.add_argument(
-        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically")
+        "--lr", default=None, type=float, help="learning rate, if None, it will be set to 1.0/iters automatically"
+    )
 
     args = parser.parse_args()
@@ -418,7 +441,8 @@ def tune(args):
     if "auto_gptq" in args.format and args.asym is True:
         logger.warning(
             "the auto_gptq kernel has issues with asymmetric quantization. "
-            "It is recommended to use sym quantization or --format='auto_round'")
+            "It is recommended to use sym quantization or --format='auto_round'"
+        )
 
     if "marlin" in args.format and args.asym is True:
         raise RuntimeError("marlin backend only supports sym quantization, please remove --asym")
@@ -432,7 +456,8 @@ def tune(args):
     if args.enable_torch_compile:
         logger.info(
             "`torch.compile` is enabled to reduce tuning costs. "
-            "If it causes issues, you can disable it by removing `--enable_torch_compile` argument.")
+            "If it causes issues, you can disable it by removing `--enable_torch_compile` argument."
+        )
 
     model_name = args.model
     if model_name[-1] == "/":
@@ -487,8 +512,12 @@ def tune(args):
     if len(not_quantize_layer_names) > 0:
         logger.info(f"{not_quantize_layer_names} will not be quantized.")
     for format in formats:
-        if ("auto_round" not in format and "fake" not in format and "awq" not in format and
-                "llm_compressor" not in format):
+        if (
+            "auto_round" not in format
+            and "fake" not in format
+            and "awq" not in format
+            and "llm_compressor" not in format
+        ):
             # TODO gptq could support some mixed precision config
             logger.warning(f"mixed precision exporting does not support {format} currently")
 
@@ -497,13 +526,15 @@ def tune(args):
         if "auto_round" not in format and "fake" not in format:
             auto_round_formats = [s for s in SUPPORTED_FORMATS if s.startswith("auto_round")]
             raise ValueError(
-                f"{format} is not supported for lm-head quantization, please change to {auto_round_formats}")
+                f"{format} is not supported for lm-head quantization, please change to {auto_round_formats}"
+            )
 
     if "auto_awq" in args.format:
         from auto_round.utils import check_awq_gemm_compatibility
 
         awq_supported, info = check_awq_gemm_compatibility(
-            model, args.bits, args.group_size, not args.asym, layer_config)
+            model, args.bits, args.group_size, not args.asym, layer_config
+        )
         if not awq_supported:
             logger.warning(f"The AutoAWQ format may not be supported due to {info}")
@@ -520,7 +551,8 @@ def tune(args):
     if args.disable_deterministic_algorithms:
         logger.warning(
             "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated,"
-            " please use enable_deterministic_algorithms instead. ")
+            " please use enable_deterministic_algorithms instead. "
+        )
     autoround = round(
         model=model,
         tokenizer=tokenizer,
@@ -642,12 +674,14 @@ def tune(args):
             if eval_model_dtype == "float32" or eval_model_dtype == "auto":
                 logger.warning(
                     "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                    " but may affect accuracy.")
+                    " but may affect accuracy."
+                )
             if gguf_file is None:
                 logger.error("Cannot find correct gguf file for evaluation, please check.")
                 sys.exit(-1)
             model = AutoModelForCausalLM.from_pretrained(
-                eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype)
+                eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+            )
             model.eval()
             tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
         else:
@@ -707,7 +741,8 @@ def tune(args):
         from auto_round.eval.evaluation import simple_evaluate
 
         tasks, model_args, device_str = _eval_init(
-            args.tasks, eval_folder, args.device_map, args.disable_trust_remote_code, dtype=eval_model_dtype)
+            args.tasks, eval_folder, args.device_map, args.disable_trust_remote_code, dtype=eval_model_dtype
+        )
         st = time.time()
         if "llama" in args.model.lower():
             model_args += ",add_bos_token=True"
@@ -741,7 +776,8 @@ def eval(args):
     import time
 
     tasks, model_args, device_str = _eval_init(
-        args.tasks, args.model, args.device_map, args.disable_trust_remote_code, args.eval_model_dtype)
+        args.tasks, args.model, args.device_map, args.disable_trust_remote_code, args.eval_model_dtype
+    )
 
     # load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES
     from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model
@@ -771,13 +807,16 @@ def eval(args):
         if eval_model_dtype == "float32" or eval_model_dtype == "auto":
             logger.warning(
                 "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                " but may affect accuracy.")
+                " but may affect accuracy."
+            )
         model = AutoModelForCausalLM.from_pretrained(
-            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype)
+            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+        )
         model.eval()
         st = time.time()
         res = simple_evaluate_user_model(
-            model, tokenizer, tasks=tasks, batch_size=batch_size, device=device_str, limit=args.limit)
+            model, tokenizer, tasks=tasks, batch_size=batch_size, device=device_str, limit=args.limit
+        )
         print(make_table(res))
         print("evaluation running time=%ds" % (time.time() - st))
     else:
@@ -845,10 +884,12 @@ def eval_task_by_task(
         if eval_model_dtype == "float32" or eval_model_dtype == "auto":
             logger.warning(
                 "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                " but may affect accuracy.")
+                " but may affect accuracy."
+            )
 
         model = AutoModelForCausalLM.from_pretrained(
-            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype)
+            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+        )
         model.eval()
         parallelism = False
         hflm = HFLM(
@@ -876,7 +917,8 @@ def eval_task_by_task(
         while retry_times:
             try:
                 res = lm_simple_evaluate(
-                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit)
+                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit
+                )
                 break
             except Exception as e:
                 cuda_error_msg = traceback.format_exc()
@@ -887,7 +929,8 @@ def eval_task_by_task(
                         hflm.batch_sizes[k] = max(v // 2, 1)
                     logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
                     res = lm_simple_evaluate(
-                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit)
+                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit
+                    )
                     hflm.batch_sizes = ori_batch_sizes
                 except Exception as e:
                     traceback.print_exc()
diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py
index b1637dbfa..bbb2b42eb 100644
--- a/auto_round/script/mllm.py
+++ b/auto_round/script/mllm.py
@@ -173,7 +173,9 @@ def __init__(self, *args, **kwargs):
         self.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile")
 
         self.add_argument(
-            "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms."
+            "--disable_deterministic_algorithms",
+            action="store_true",
+            help="deprecated, disable torch deterministic algorithms.",
         )
 
         self.add_argument(
+ ) model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype) + model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + ) model.eval() parallelism = False hflm = HFLM( @@ -876,7 +917,8 @@ def eval_task_by_task( while retry_times: try: res = lm_simple_evaluate( - model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit) + model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit + ) break except Exception as e: cuda_error_msg = traceback.format_exc() @@ -887,7 +929,8 @@ def eval_task_by_task( hflm.batch_sizes[k] = max(v // 2, 1) logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.") res = lm_simple_evaluate( - model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit) + model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit + ) hflm.batch_sizes = ori_batch_sizes except Exception as e: traceback.print_exc() diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index b1637dbfa..bbb2b42eb 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -173,7 +173,9 @@ def __init__(self, *args, **kwargs): self.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch compile") self.add_argument( - "--disable_deterministic_algorithms", action="store_true", help="deprecated, disable torch deterministic algorithms." + "--disable_deterministic_algorithms", + action="store_true", + help="deprecated, disable torch deterministic algorithms.", ) self.add_argument( From cca567e33e4c45886a57252acd14e7083580130f Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 9 Sep 2025 22:24:48 +0800 Subject: [PATCH 8/8] set use_deterministic_algorithms with warn_only=True as default --- auto_round/autoround.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 35786ab91..03e012578 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -235,7 +235,8 @@ def __init__( if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") - + if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # deprecated, default not to use torch.use_deterministic_algorithms if not disable_deterministic_algorithms or enable_deterministic_algorithms: if not disable_deterministic_algorithms: @@ -243,9 +244,10 @@ def __init__( "default not use deterministic_algorithms. disable_deterministic_algorithms is deprecated," " please use enable_deterministic_algorithms instead. " ) - if "CUBLAS_WORKSPACE_CONFIG" not in os.environ: - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + torch.use_deterministic_algorithms(True, warn_only=False) + else: + torch.use_deterministic_algorithms(True, warn_only=True) if device is not None: logger.warning("`device` is deprecated, please use `device_map` instead")