From 6f4f7233b98c1be6c1ce92a583d331bf3972029c Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Tue, 27 May 2025 23:03:57 -0700 Subject: [PATCH 1/3] Update `--int8` flag to `--optimized` flag Signed-off-by: Sun, Xuehao --- examples/3.x_api/pytorch/cv/static_quant/main.py | 6 +++--- .../pytorch/cv/static_quant/run_benchmark.sh | 8 ++++---- .../stable_diffusion/smooth_quant/main.py | 4 ++-- .../stable_diffusion/smooth_quant/run_benchmark.sh | 8 ++++---- .../quantization/static_quant/ipex/main.py | 8 ++++---- .../static_quant/ipex/run_benchmark.sh | 8 ++++---- .../quantization/auto_round/run_benchmark.sh | 6 +++--- .../quantization/fp8_quant/run_benchmark.sh | 6 +++--- .../quantization/smooth_quant/run_benchmark.sh | 8 ++++---- .../smooth_quant/run_clm_no_trainer.py | 6 +++--- .../static_quant/ipex/run_benchmark.sh | 8 ++++---- .../static_quant/ipex/run_clm_no_trainer.py | 6 +++--- .../static_quant/pt2e/run_benchmark.sh | 8 ++++---- .../static_quant/pt2e/run_clm_no_trainer.py | 6 +++--- .../weight_only/text-generation/run_benchmark.sh | 6 +++--- .../quantization/weight_only/run_benchmark.sh | 6 +++--- .../static_quant/ipex/run_benchmark.sh | 8 ++++---- .../quantization/static_quant/ipex/run_qa.py | 8 ++++---- .../dlrm/static_quant/ipex/dlrm_s_pytorch.py | 10 +++++----- .../dlrm/static_quant/ipex/run_benchmark.sh | 10 +++++----- .../vision_transformer/quantization/ptq/main.py | 4 ++-- .../quantization/ptq/run_benchmark.sh | 8 ++++---- .../quantization/ptq/smoothquant/benchmark.py | 14 +++++++------- .../quantization/ptq/smoothquant/run_benchmark.sh | 10 +++++----- 24 files changed, 90 insertions(+), 90 deletions(-) diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py index 655723221f0..64aeb75258a 100644 --- a/examples/3.x_api/pytorch/cv/static_quant/main.py +++ b/examples/3.x_api/pytorch/cv/static_quant/main.py @@ -86,8 +86,8 @@ help='For accuracy measurement only.') parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH', help='path to checkpoint tuned by Neural Compressor (default: ./)') -parser.add_argument('--int8', dest='int8', action='store_true', - help='Load int8 model.') +parser.add_argument('--optimized', dest='optimized', action='store_true', + help='Load optimized model.') parser.add_argument("--calib_iters", default=128, type=int, help="For calibration only.") parser.add_argument("--iters", default=100, type=int, @@ -222,7 +222,7 @@ def eval_func(model): return if args.performance or args.accuracy: - if args.int8: + if args.optimized: from neural_compressor.torch.quantization import load q_model = load(args.tuned_checkpoint) diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh index cff26385b50..c527e65d504 100644 --- a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh +++ b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh @@ -35,8 +35,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -63,8 +63,8 @@ function run_benchmark { echo "Error: No such mode: ${mode}" exit 1 fi - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi echo $extra_cmd diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/main.py b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/main.py index e3bf4b1d7f1..c4c6632953b 100644 --- a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/main.py +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/main.py @@ -325,7 +325,7 @@ def __call__( parser.add_argument('--precision', default='fp32', type=str) parser.add_argument('--base-output-dir', default="./output", type=str) parser.add_argument('--quantized-unet', default="./saved_results", type=str) -parser.add_argument("--int8", action="store_true", help="Load quantized model.") +parser.add_argument("--optimized", action="store_true", help="Load quantized model.") parser.add_argument("--load", action="store_true") parser.add_argument('--iters', default=5000, type=int, help="Num of image generated.") parser.add_argument('--output-dir-name', default=None, type=str) @@ -411,7 +411,7 @@ def __call__( variant="fp16" if args.precision == 'fp16' else None, torch_dtype=dtype) -if args.int8 and args.load: +if args.optimized and args.load: from neural_compressor.torch.quantization import load example_inputs = {"sample": torch.randn((2, 4, 128, 128), dtype=dtype), "timestep": torch.tensor(951.0), diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_benchmark.sh b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_benchmark.sh index 09231111e69..181ff3a7c90 100644 --- a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_benchmark.sh +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/run_benchmark.sh @@ -20,8 +20,8 @@ function init_params { --iters=*) iters=$(echo $var | cut -f2 -d=) ;; - --int8=*) - int8=$(echo $var | cut -f2 -d=) + --optimized=*) + optimized=$(echo $var | cut -f2 -d=) ;; --mode=*) mode=$(echo $var | cut -f2 -d=) @@ -43,8 +43,8 @@ function run_benchmark { latent="latents.pt" base_output_dir="./output/" - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi echo $extra_cmd diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py index a308aacad35..26c6c79a358 100644 --- a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/main.py @@ -102,7 +102,7 @@ help='For accuracy measurement only.') parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH', help='path to checkpoint tuned by Neural Compressor (default: ./)') -parser.add_argument('--int8', dest='int8', action='store_true', +parser.add_argument('--optimized', dest='optimized', action='store_true', help='run benchmark') parser.add_argument('--ipex', dest='ipex', action='store_true', help='tuning or benchmark with Intel PyTorch Extension') @@ -196,7 +196,7 @@ def main_worker(gpu, ngpus_per_node, args): else: model = quantize_models.__dict__[args.arch]() - if args.ipex and not args.int8: + if args.ipex and not args.optimized: model = model.to(memory_format=torch.channels_last) if not torch.cuda.is_available(): @@ -333,8 +333,8 @@ def run_fn(model): if args.performance or args.accuracy: model.eval() - if args.int8: - print("load int8 model") + if args.optimized: + print("load optimized model") from neural_compressor.torch.quantization import load model = load(os.path.abspath(os.path.expanduser(args.tuned_checkpoint))) else: diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh index f5a2e251554..2623750afa8 100644 --- a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/run_benchmark.sh @@ -34,8 +34,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --xpu=*) xpu=$(echo ${var} |cut -f2 -d=) @@ -66,8 +66,8 @@ function run_benchmark { extra_cmd=$extra_cmd" --hub" fi - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi if [[ ${xpu} == "true" ]]; then diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh index 24e52d179ca..e371eaa1b1e 100644 --- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh +++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/run_benchmark.sh @@ -35,8 +35,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -64,7 +64,7 @@ function run_benchmark { exit 1 fi - if [[ ${int8} == "true" ]]; then + if [[ ${optimized} == "true" ]]; then extra_cmd=$extra_cmd" --load" fi echo $extra_cmd diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/fp8_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/fp8_quant/run_benchmark.sh index 002e9527016..4454ee2eba1 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/fp8_quant/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/fp8_quant/run_benchmark.sh @@ -35,8 +35,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -94,7 +94,7 @@ function run_benchmark { python_cmd="deepspeed --num_gpus 8" fi - if [[ ${int8} == "true" ]]; then + if [[ ${optimized} == "true" ]]; then ${python_cmd} quantize.py \ --model ${tuned_checkpoint} \ --load\ diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh index 7b60727b047..463da6c816a 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh @@ -37,8 +37,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -68,8 +68,8 @@ function run_benchmark { exit 1 fi - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi echo $extra_cmd diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index c77d1b77af4..6d9d8947c79 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -29,7 +29,7 @@ parser.add_argument( "--approach", type=str, default="static", help="Select from ['dynamic', 'static', 'weight-only']" ) -parser.add_argument("--int8", action="store_true") +parser.add_argument("--optimized", action="store_true") parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") parser.add_argument("--load", action="store_true", help="Load quantized model.") parser.add_argument("--accuracy", action="store_true") @@ -217,8 +217,8 @@ def eval_func(model): if args.load: - if args.int8 or args.int8_bf16_mixed: - print("load int8 model") + if args.optimized or args.int8_bf16_mixed: + print("load optimized model") from neural_compressor.torch.quantization import load tokenizer = AutoTokenizer.from_pretrained(args.model) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh index b62a6381b20..ced864b1aa5 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh @@ -37,8 +37,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -68,8 +68,8 @@ function run_benchmark { exit 1 fi - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi echo $extra_cmd diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py index b6979180811..146b94c5e4b 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py @@ -35,7 +35,7 @@ ) parser.add_argument("--approach", type=str, default='static', help="Select from ['dynamic', 'static', 'weight-only']") -parser.add_argument("--int8", action="store_true") +parser.add_argument("--optimized", action="store_true") parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") parser.add_argument("--load", action="store_true", help="Load quantized model.") parser.add_argument("--accuracy", action="store_true") @@ -198,8 +198,8 @@ def run_fn(model): user_model.save(args.output_dir) if args.load: - if args.int8 or args.int8_bf16_mixed: - print("load int8 model") + if args.optimized or args.int8_bf16_mixed: + print("load optimized model") from neural_compressor.torch.quantization import load tokenizer = AutoTokenizer.from_pretrained(args.model) config = AutoConfig.from_pretrained(args.model) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh index 169142cddb8..ae7f226eb4b 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh @@ -36,8 +36,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -67,8 +67,8 @@ function run_benchmark { exit 1 fi - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi echo $extra_cmd diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py index 4019242ec51..e1e2ffe28e6 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -18,7 +18,7 @@ parser.add_argument("--quantize", action="store_true") parser.add_argument("--approach", type=str, default='static', help="Select from ['dynamic', 'static', 'weight-only']") -parser.add_argument("--int8", action="store_true") +parser.add_argument("--optimized", action="store_true") parser.add_argument("--accuracy", action="store_true") parser.add_argument("--performance", action="store_true") parser.add_argument("--calib_iters", default=2, type=int, @@ -102,9 +102,9 @@ def get_example_inputs(tokenizer): converted_model.save(example_inputs=example_inputs, output_dir = args.output_dir) -if args.int8: +if args.optimized: if args.output_dir: - print("Load int8 model.") + print("Load optimized model.") from neural_compressor.torch.quantization import load model_config = user_model.config user_model = load(args.output_dir) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_benchmark.sh index 1ed8c54b1ce..ea6ff4d63b1 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_benchmark.sh @@ -36,8 +36,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -238,7 +238,7 @@ function run_benchmark { script="run_generation_cpu_woq.py" fi fi - if [[ ${int8} == "true" ]] && [[ "$model_source" != "huggingface" ]]; then + if [[ ${optimized} == "true" ]] && [[ "$model_source" != "huggingface" ]]; then model_name_or_path=$tuned_checkpoint fi if [[ $backend == "neuralspeed" ]]; then diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 8e536418bb2..97d678b772f 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -37,8 +37,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -66,7 +66,7 @@ function run_benchmark { exit 1 fi - if [[ ${int8} == "true" ]]; then + if [[ ${optimized} == "true" ]]; then extra_cmd=$extra_cmd" --load" fi echo $extra_cmd diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh index 2f646afacdb..40b8104a975 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_benchmark.sh @@ -34,8 +34,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -65,8 +65,8 @@ function run_benchmark { fi extra_cmd="" - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi if [[ ${xpu} == "true" ]]; then extra_cmd=$extra_cmd" --xpu" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py index 079c0749994..e50bc6ba4a9 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/question-answering/quantization/static_quant/ipex/run_qa.py @@ -98,8 +98,8 @@ class ModelArguments: default=False, metadata={"help": "Whether or not to apply quantization."}, ) - int8: bool = field( - default=False, metadata={"help": "use int8 model to get accuracy or benchmark"} + optimized: bool = field( + default=False, metadata={"help": "use optimized model to get accuracy or benchmark"} ) benchmark: bool = field( default=False, metadata={"help": "get benchmark instead of accuracy"} @@ -699,8 +699,8 @@ def run_fn(model): return model.eval() - if model_args.int8: - print("load int8 model") + if model_args.optimized: + print("load optimized model") from neural_compressor.torch.quantization import load model = load(os.path.abspath(os.path.expanduser(training_args.output_dir))) else: diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py index 0ae812a182f..80e673cdf1f 100644 --- a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/dlrm_s_pytorch.py @@ -406,12 +406,12 @@ def trace_or_load_model(args, dlrm, test_ld, inplace=True): if args.inference_only: dlrm.emb_l.bfloat16() dlrm = ipex.optimize(dlrm, dtype=torch.bfloat16, inplace=inplace) - elif args.int8 and not args.tune: + elif args.optimized and not args.tune: if args.num_cpu_cores != 0: torch.set_num_threads(args.num_cpu_cores) from neural_compressor.torch.quantization import load dlrm = load(args.save_model) - elif args.int8 and args.tune: + elif args.optimized and args.tune: dlrm = dlrm else: dlrm = ipex.optimize(dlrm, dtype=torch.float, inplace=True, auto_kernel_selection=True) @@ -674,7 +674,7 @@ def run(): parser.add_argument("--ipex-interaction", action="store_true", default=False) parser.add_argument("--ipex-merged-emb", action="store_true", default=False) parser.add_argument("--num-warmup-iters", type=int, default=1000) - parser.add_argument("--int8", action="store_true", default=False) + parser.add_argument("--optimized", action="store_true", default=False) parser.add_argument("--dist-backend", type=str, default="ccl") parser.add_argument("--tune", action="store_true", default=False) parser.add_argument("--benchmark", action="store_true", default=False) @@ -820,7 +820,7 @@ def run(): if args.tune: # evaluation def eval_func(model): - args.int8 = getattr(model, "is_quantized", False) + args.optimized = getattr(model, "is_quantized", False) with torch.no_grad(): return inference( args, @@ -828,7 +828,7 @@ def eval_func(model): best_acc_test, best_auc_test, test_ld, - trace=args.int8 + trace=args.optimized ) # calibration diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh index e3c25b0f582..9da844e5545 100755 --- a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/run_benchmark.sh @@ -34,8 +34,8 @@ function init_params { --iters=*) iters=$(echo ${var} |cut -f2 -d=) ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) + --optimized=*) + optimized=$(echo ${var} |cut -f2 -d=) ;; --config=*) tuned_checkpoint=$(echo $var |cut -f2 -d=) @@ -61,9 +61,9 @@ function run_tuning { CORES=`lscpu | grep Core | awk '{print $4}'` ARGS="" - if [[ ${int8} == "true" ]]; then - echo "running int8 path" - ARGS="$ARGS --int8" + if [[ ${optimized} == "true" ]]; then + echo "running optimized path" + ARGS="$ARGS --optimized" else echo "running fp32 path" fi diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/main.py b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/main.py index 92b2ea0fb2a..d7e7f3bd15b 100644 --- a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/main.py +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/main.py @@ -59,7 +59,7 @@ help='location of calibration dataset and evaluate dataset') arg_parser.add_argument('--batch_size', type=int, default=32, dest='batch_size', help='batch_size of benchmark') arg_parser.add_argument('--iters', type=int, default=100, dest='iters', help='interations') -arg_parser.add_argument('--int8', dest='int8', action='store_true', help='whether to use int8 model for benchmark') +arg_parser.add_argument('--optimized', dest='optimized', action='store_true', help='whether to use optimized model for benchmark') args = arg_parser.parse_args() def evaluate(model, eval_dataloader, preprocess=None): @@ -161,7 +161,7 @@ def run(self): ) dataloader = TFDataLoader(dataset=dataset, batch_size=args.batch_size) - if args.int8 or args.input_graph.endswith("-tune.pb"): + if args.optimized or args.input_graph.endswith("-tune.pb"): input_graph = args.input_graph else: sm = saved_model_pb2.SavedModel() diff --git a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_benchmark.sh b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_benchmark.sh index 2348865d66e..baaea9b7379 100644 --- a/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_benchmark.sh +++ b/examples/3.x_api/tensorflow/image_recognition/vision_transformer/quantization/ptq/run_benchmark.sh @@ -31,8 +31,8 @@ function init_params { --iters=*) iters=$(echo $var |cut -f2 -d=) ;; - --int8=*) - int8=$(echo $var |cut -f2 -d=) + --optimized=*) + optimized=$(echo $var |cut -f2 -d=) ;; esac done @@ -41,8 +41,8 @@ function init_params { # run_tuning function run_benchmark { - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" + if [[ ${optimized} == "true" ]]; then + extra_cmd=$extra_cmd" --optimized" fi python main.py \ --input-graph ${input_model} \ diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/benchmark.py b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/benchmark.py index 673d50c034f..eda0d222d39 100644 --- a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/benchmark.py +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/benchmark.py @@ -28,7 +28,7 @@ sys.path.insert(0, './') parser = argparse.ArgumentParser() -parser.add_argument('--int8', action='store_true', help="eval fp32 model or int8 model") +parser.add_argument('--optimized', action='store_true', help="eval fp32 model or optimized model") parser.add_argument('--model_name_or_path', type=str, default='facebook/opt-125m') parser.add_argument('--batch_size', type=int, default=16) parser.add_argument('--warmup', type=int, default=10) @@ -173,13 +173,13 @@ def __len__(self): evaluator = Evaluator(eval_dataset, tokenizer, 'cpu') -if args.int8: - print("benchmarking int8 model") - int8_folder = model_name.split('/')[-1] + "_int8" - if not os.path.exists(int8_folder): - print(f"could not find int8 folder {int8_folder} ") +if args.optimized: + print("benchmarking optimized model") + optimized_folder = model_name.split('/')[-1] + "_int8" + if not os.path.exists(optimized_folder): + print(f"could not find optimized folder {optimized_folder} ") exit() - model = tf.saved_model.load(int8_folder) # tensorflow.python.trackable.autotrackable.AutoTrackable object + model = tf.saved_model.load(optimized_folder) # tensorflow.python.trackable.autotrackable.AutoTrackable object else: print("benchmaking fp32 model") model = transformers.TFAutoModelForCausalLM.from_pretrained(model_name) diff --git a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_benchmark.sh b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_benchmark.sh index b8fad17eebd..fed1991e4ca 100644 --- a/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_benchmark.sh +++ b/examples/3.x_api/tensorflow/nlp/large_language_models/quantization/ptq/smoothquant/run_benchmark.sh @@ -10,7 +10,7 @@ function main { # init params function init_params { - int8=false + optimized=false batch_size=16 for var in "$@" do @@ -18,8 +18,8 @@ function init_params { --input_model=*) input_model=$(echo $var |cut -f2 -d=) ;; - --int8=*) - int8=$(echo $var |cut -f2 -d=) + --optimized=*) + optimized=$(echo $var |cut -f2 -d=) ;; --batch_size=*) batch_size=$(echo $var |cut -f2 -d=) @@ -31,11 +31,11 @@ function init_params { # run_tuning function run_benchmark { - if [[ "${int8}" == "true" ]]; then + if [[ "${optimized}" == "true" ]]; then python benchmark.py \ --model_name_or_path ${input_model} \ --batch_size ${batch_size} \ - --int8 + --optimized else python benchmark.py \ --model_name_or_path ${input_model} \ From 884e1fcf3b1d3b819c2e69483137a4900417c29f Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Wed, 28 May 2025 22:54:47 -0700 Subject: [PATCH 2/3] remove model Signed-off-by: Sun, Xuehao --- .azure-pipelines/model-test-3x.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.azure-pipelines/model-test-3x.yml b/.azure-pipelines/model-test-3x.yml index e32f5e92f6d..708886280ef 100644 --- a/.azure-pipelines/model-test-3x.yml +++ b/.azure-pipelines/model-test-3x.yml @@ -33,7 +33,6 @@ parameters: default: - opt_125m_woq_gptq_int4 - opt_125m_woq_gptq_nf4_dq_bnb - - opt_125m_woq_gptq_int4_dq_ggml stages: - stage: PyTorchModels From 1d30ba60ae9fce2c7a6a1677574967d24c2e5302 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Tue, 3 Jun 2025 06:03:00 -0700 Subject: [PATCH 3/3] update readme Signed-off-by: Sun, Xuehao --- examples/3.x_api/pytorch/cv/fp8_quant/main.py | 6 +++--- .../diffusers/stable_diffusion/smooth_quant/README.md | 4 ++-- .../stable_diffusion/smooth_quant/sdxl_smooth_quant.py | 4 ++-- .../quantization/static_quant/ipex/README.md | 8 ++++---- .../quantization/static_quant/pt2e/README.md | 4 ++-- .../recommendation/dlrm/static_quant/ipex/README.md | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/3.x_api/pytorch/cv/fp8_quant/main.py b/examples/3.x_api/pytorch/cv/fp8_quant/main.py index dfa7515343c..71e0c330f31 100644 --- a/examples/3.x_api/pytorch/cv/fp8_quant/main.py +++ b/examples/3.x_api/pytorch/cv/fp8_quant/main.py @@ -60,7 +60,7 @@ parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set') parser.add_argument('-t', '--tune', dest='tune', action='store_true', - help='tune best int8 model on calibration dataset') + help='tune best fp8 model on calibration dataset') parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model') parser.add_argument('--world-size', default=-1, type=int, @@ -94,7 +94,7 @@ help='For accuracy measurement only.') parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH', help='path to checkpoint tuned by Neural Compressor (default: ./)') -parser.add_argument('--int8', dest='int8', action='store_true', +parser.add_argument('--optimized', dest='optimized', action='store_true', help='run benchmark') parser.add_argument('--device', default='hpu', type=str, help='use hpu device for fp8 quantization') @@ -205,7 +205,7 @@ def eval_func(model): if args.performance or args.accuracy: model.eval() - if args.int8: + if args.optimized: from neural_compressor.utils.pytorch import load new_model = load(os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model, diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/README.md b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/README.md index 6b37038d0dc..57a40cafcfb 100644 --- a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/README.md +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/README.md @@ -24,7 +24,7 @@ sh run_quant.sh --alpha=0.44 ``` To load a quantized model: ```bash -python sdxl_smooth_quant.py --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --quantize --load --int8 +python sdxl_smooth_quant.py --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --quantize --load --optimized ``` or ```bash @@ -79,5 +79,5 @@ python clip/clip_score.py \ ``` Or you can use the bash script for all steps above: ```bash -sh run_benchmark.sh --mode=accuracy --int8=true +sh run_benchmark.sh --mode=accuracy --optimized=true ``` diff --git a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/sdxl_smooth_quant.py b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/sdxl_smooth_quant.py index 8e9c383b80d..613c1e6f3ae 100644 --- a/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/sdxl_smooth_quant.py +++ b/examples/3.x_api/pytorch/diffusion_model/diffusers/stable_diffusion/smooth_quant/sdxl_smooth_quant.py @@ -339,7 +339,7 @@ def main(): ) parser.add_argument("--quantize", action="store_true") parser.add_argument("--load", action="store_true") - parser.add_argument("--int8", action="store_true", help="Load quantized model.") + parser.add_argument("--optimized", action="store_true", help="Load quantized model.") parser.add_argument("--performance", action="store_true") parser.add_argument("--n_steps", type=int, default=20) parser.add_argument("--batch-size", type=int, default=1) @@ -404,7 +404,7 @@ def forward_loop(model): q_unet.save(args.output_dir) if args.load: - if args.int8: + if args.optimized: from neural_compressor.torch.quantization import load q_unet = load(os.path.abspath(os.path.expanduser(args.output_dir))) else: diff --git a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md index 21a7a0884a3..ade6b9550eb 100644 --- a/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md +++ b/examples/3.x_api/pytorch/image_recognition/torchvision_models/quantization/static_quant/ipex/README.md @@ -59,7 +59,7 @@ python main.py -t -a resnet18 --ipex --pretrained /path/to/imagenet or ```shell bash run_quant.sh --input_model=resnet18 --dataset_location=/path/to/imagenet -bash run_benchmark.sh --input_model=resnet18 --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false +bash run_benchmark.sh --input_model=resnet18 --dataset_location=/path/to/imagenet --mode=performance/accuracy --optimized=true/false ``` ### 2. ResNet50 With Intel PyTorch Extension @@ -70,7 +70,7 @@ python main.py -t -a resnet50 --ipex --pretrained /path/to/imagenet or ```shell bash run_quant.sh --input_model=resnet50 --dataset_location=/path/to/imagenet -bash run_benchmark.sh --input_model=resnet50 --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false +bash run_benchmark.sh --input_model=resnet50 --dataset_location=/path/to/imagenet --mode=performance/accuracy --optimized=true/false ``` ### 3. ResNext101_32x16d With Intel PyTorch Extension @@ -81,7 +81,7 @@ python main.py -t -a resnext101_32x16d_wsl --hub --ipex --pretrained /path/to/im or ```shell bash run_quant.sh --input_model=resnext101_32x16d_wsl --dataset_location=/path/to/imagenet -bash run_benchmark.sh --input_model=resnext101_32x16d_wsl --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false +bash run_benchmark.sh --input_model=resnext101_32x16d_wsl --dataset_location=/path/to/imagenet --mode=performance/accuracy --optimized=true/false ``` # Run with Intel GPU @@ -96,5 +96,5 @@ python main.py -t -a resnet18 --ipex --pretrained /path/to/imagenet --xpu or ```shell bash run_quant.sh --input_model=resnet18 --dataset_location=/path/to/imagenet -bash run_benchmark.sh --input_model=resnet18 --dataset_location=/path/to/imagenet --mode=performance/accuracy --int8=true/false --xpu=true/false +bash run_benchmark.sh --input_model=resnet18 --dataset_location=/path/to/imagenet --mode=performance/accuracy --optimized=true/false --xpu=true/false ``` diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md index 4e9f09ab858..fd153b73775 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md @@ -32,7 +32,7 @@ python run_clm_no_trainer.py --model facebook/opt-125m --quantize --output_dir q python run_clm_no_trainer.py --model facebook/opt-125m --accuracy --tasks lambada_openai # Measure the accuracy of the quantized model -python run_clm_no_trainer.py --model facebook/opt-125m --accuracy --tasks lambada_openai --int8 --output_dir qmodel_save_path +python run_clm_no_trainer.py --model facebook/opt-125m --accuracy --tasks lambada_openai --optimized --output_dir qmodel_save_path ``` #### Performance @@ -41,5 +41,5 @@ python run_clm_no_trainer.py --model facebook/opt-125m --accuracy --tasks lamba python run_clm_no_trainer.py --model facebook/opt-125m --performance # Measure the performance of the quantized model -python run_clm_no_trainer.py --model facebook/opt-125m --performance --int8 --output_dir qmodel_save_path +python run_clm_no_trainer.py --model facebook/opt-125m --performance --optimized --output_dir qmodel_save_path ``` diff --git a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md index 918cc1edc23..38a87af55f7 100644 --- a/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md +++ b/examples/3.x_api/pytorch/recommendation/dlrm/static_quant/ipex/README.md @@ -42,7 +42,7 @@ PyTorch 1.11 or higher version is needed with pytorch_fx backend. ### benchmark ```shell -bash run_benchmark.sh --input_model="/path/of/pretrained/model" --dataset_location="/path/of/dataset" --mode=accuracy --int8=true +bash run_benchmark.sh --input_model="/path/of/pretrained/model" --dataset_location="/path/of/dataset" --mode=accuracy --optimized=true ```