Merged
Changes from all commits (59 commits):
c7f995b
Squashed commit of the following:
xinhe3 Oct 11, 2024
279fcb4
fix missing
xinhe3 Oct 11, 2024
22dcb77
[SW-192016] fix measurements tool
Yantom1 Aug 12, 2024
902e8b2
[SW-192996] llama promotion pip install versions collision of INC dep…
Yantom1 Aug 15, 2024
6a54fd7
[SW-198294] INC to read world size and local rank from torch.distribu…
Yantom1 Aug 25, 2024
54835da
[SW-196480] Add pow2 scale config for more modules
ulivne Aug 12, 2024
9dd31fd
[SW-198690] Import numba packing func when needed
Yi4Liu Aug 28, 2024
9b963e6
[SW-180042] Change HQT HF8_143 max range
Yantom1 Aug 21, 2024
aaefde6
[SW-187399] Support vLLM Mixtral quantization
dudilester Jun 20, 2024
74c5d24
[SW-197033][SW-197034] Separate unit scale from measurement and add H…
HolyFalafel Aug 15, 2024
8a1e6db
[SW-198578] fix parsing of default fake_quant config
ulivne Aug 27, 2024
671de6f
[ALGO-797] enabled quarot - modified unittest, improved scripts reada…
tgafni Aug 1, 2024
02d0868
[SW-199642] Fix way of getting device type in scale calculation in INC
Yantom1 Sep 4, 2024
b7e07e9
[SW-194429] Workaround to cholesky accuracy issue
HolyFalafel Aug 7, 2024
283cd69
[SW-200060] fixed missing __init__.py in quarot
Sep 8, 2024
5baf7d3
[SW-199769] Bugfix: fixing ignored types in whitelist
Danielohayon Sep 8, 2024
ead322c
[SW-199944] Remove pre installed transformers in release dockers
xinhe3 Sep 9, 2024
212e96f
[SW-174155] Revert ProcessSafeReadWriteLock implementation
Tiefen-boop Sep 5, 2024
0eec282
[SW-198238] support scales as scalar for low bs perf
ulivne Sep 4, 2024
ac49e2d
[SW-199826] rename neural_compressor_3x_pt to neural_compressor_pt an…
xinhe3 Sep 6, 2024
42f8960
[SW-200369] Ignoring dump_stats_path on quant only scale methods
HolyFalafel Sep 10, 2024
373d052
[SW-199866] removing PatchedMoeMatmul weight contiguous call on measure
dudilester Sep 11, 2024
83c3d5f
[SW-198220] Fix bug with scale_to_scalar with dim != 0
Tiefen-boop Sep 11, 2024
06d5801
Revert "[SW-194429] Workaround to cholesky accuracy issue"
HolyFalafel Sep 11, 2024
598ef84
[SW-198220] Set PatchedSoftmax to work with CONST scale only
Tiefen-boop Sep 16, 2024
8990ab5
[SW-201115] [performance] fallback ops from hpu to cpu
xinhe3 Sep 13, 2024
ae450e2
[SW-203595] Update numpy version for Python 3.12
Sep 26, 2024
ad6c97a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
f7a92a7
fix mismatch
xinhe3 Oct 11, 2024
dbfcfbd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
964770b
fix pre-commit issues
xinhe3 Oct 11, 2024
54041b2
fix UT
xinhe3 Oct 11, 2024
b191a23
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
89bc8af
fix import issue
xinhe3 Oct 11, 2024
dd94b32
update habana image
XuehaoSun Oct 11, 2024
76b1911
add numba
xinhe3 Oct 11, 2024
27e521a
fix accuracy issue on hpu
xinhe3 Oct 11, 2024
8e00509
add tbb
xinhe3 Oct 11, 2024
7c86798
fix UT
xinhe3 Oct 12, 2024
3ebed80
update code coverage range
xinhe3 Oct 12, 2024
ec7459b
fix pre-ci check temporarily
xinhe3 Oct 12, 2024
7676cd8
fix pre-commit
XuehaoSun Oct 12, 2024
a6722e9
add comment
xinhe3 Oct 12, 2024
a7451c3
Update PT_FP8Quant.md
xin3he Oct 14, 2024
a543abc
move fp8_sample to hello_world/fp8_example
xinhe3 Oct 14, 2024
2b802e8
[SW-203452] Fixing and temp skipping G3 unittests
HolyFalafel Sep 25, 2024
9cc006f
set deepspeed to use Habana v1.18.0
xinhe3 Oct 14, 2024
4dd335a
update unittest
XuehaoSun Oct 15, 2024
85e223e
remove deepspeed version limit in test
chensuyue Oct 15, 2024
a595949
improve UT coverage
xinhe3 Oct 15, 2024
4f4a0bf
update example requirement
xinhe3 Oct 15, 2024
2323a20
add __init__
xinhe3 Oct 15, 2024
55d19c4
change requirement per discussion
xinhe3 Oct 15, 2024
b072f18
update setup.py
xinhe3 Oct 15, 2024
cb91dff
Merge branch 'master' into cherry_pick_1.18.0
xin3he Oct 15, 2024
ef5d3dc
Merge branch 'master' into cherry_pick_1.18.0
xin3he Oct 16, 2024
9d8a2ee
update woq example
xinhe3 Oct 16, 2024
4c34307
refine document per suggestion
xinhe3 Oct 17, 2024
9377ac6
fix CI
XuehaoSun Oct 17, 2024
2 changes: 1 addition & 1 deletion (file path not shown)
@@ -88,7 +88,7 @@ elif [ "${mode}" == "tuning" ]; then
cd ${WORK_SOURCE_DIR}/${model_src_dir}
# for int4 models add "--accuracy" to run tuning after quantize
if [[ "${model}" == *"int4"* ]]; then
sed -i "s|--quantize|--quantize --accuracy --int8|g" run_quant.sh
sed -i "s|--quantize|--quantize --accuracy --load|g" run_quant.sh
fi

$BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
1 change: 1 addition & 0 deletions .azure-pipelines/scripts/ut/3x/coverage.3x_pt
@@ -7,6 +7,7 @@ include =
*/neural_compressor/torch/*
omit =
*/neural_compressor/torch/algorithms/fp8_quant/*
+*/neural_compressor/torch/algorithms/mixed_low_precision/*
*/neural_compressor/torch/amp/*
exclude_lines =
pragma: no cover
1 change: 1 addition & 0 deletions .azure-pipelines/scripts/ut/3x/coverage.3x_pt_fp8
@@ -4,6 +4,7 @@ branch = True
[report]
include =
*/neural_compressor/torch/algorithms/fp8_quant/*
+*/neural_compressor/torch/algorithms/mixed_low_precision/*
exclude_lines =
pragma: no cover
raise NotImplementedError
2 changes: 1 addition & 1 deletion .azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
@@ -10,7 +10,6 @@ sed -i '/^intel_extension_for_pytorch/d' /neural-compressor/test/3x/torch/requir
sed -i '/^auto_round/d' /neural-compressor/test/3x/torch/requirements.txt
cat /neural-compressor/test/3x/torch/requirements.txt
pip install -r /neural-compressor/test/3x/torch/requirements.txt
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0
pip install pytest-cov
pip install pytest-html
pip install pytest-html-merger
@@ -27,6 +26,7 @@ pytest --cov="${inc_path}" -vs --disable-warnings --html=report_1.html --self-co
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_2.html --self-contained-html torch/quantization/weight_only/test_rtn.py 2>&1 | tee -a ${ut_log_name}
# pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/weight_only/test_autoround.py 2>&1 | tee -a ${ut_log_name}
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_4.html --self-contained-html torch/quantization/fp8_quant 2>&1 | tee -a ${ut_log_name}
pytest --cov="${inc_path}" -vs --disable-warnings --html=report_5.html --self-contained-html torch/algorithms/fp8_quant 2>&1 | tee -a ${ut_log_name}

mkdir -p report && mv *.html report
pytest_html_merger -i ./report -o ./report.html
4 changes: 2 additions & 2 deletions .azure-pipelines/template/docker-template.yml
@@ -74,7 +74,7 @@ steps:

- ${{ if eq(parameters.imageSource, 'pull') }}:
- script: |
-docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
+docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
displayName: "Pull habana docker image"

- script: |
@@ -95,7 +95,7 @@ steps:
else
docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
--v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
+-v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
fi
echo "Show the container list after docker run ... "
docker ps -a
11 changes: 1 addition & 10 deletions docs/source/3x/PT_FP8Quant.md
@@ -20,15 +20,6 @@ Intel Neural Compressor provides general quantization APIs to leverage HPU FP8 c

## Supported Parameters

<style type="text/css">
.tg {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-fymr{border-color:inherit;font-weight:bold;text-align:left;vertical-align:top}
.tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top}
</style>
<table class="tg"><thead>
<tr>
<th class="tg-fymr">Attribute</th>
@@ -74,7 +65,7 @@ Intel Neural Compressor provides general quantization APIs to leverage HPU FP8 c
<tr>
<td class="tg-0pky">scale_method</td>
<td class="tg-0pky">The method for calculating the scale from the measurement.</td>
<td class="tg-0pky">- without_scale - Convert to/from FP8 without scaling.<br>- unit_scale - Always use scale of 1.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
<td class="tg-0pky">- unit_scale - Always use scale of 1.<br>- hw_aligned_single_scale - Always use scale that's aligned to the corresponding HW accelerated scale.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
</tr>
<tr>
<td class="tg-0pky">measure_exclude</td>
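The row above only reorders the start of the scale_method list (dropping without_scale and adding hw_aligned_single_scale). As a rough illustration of what the simplest of these options compute, here is a small sketch in plain PyTorch of turning a maxabs measurement into a scale; the FP8 full-scale value, the rounding direction, and the helper itself are assumptions for illustration, not the INC/HQT code path.

```python
import torch

def maxabs_to_scale(maxabs: torch.Tensor, fp8_max: float = 240.0,
                    method: str = "maxabs_pow2") -> torch.Tensor:
    """Toy helper: derive a dequantization scale from a maxabs measurement.

    fp8_max is the FP8 full-scale value (240.0 assumed here for Gaudi-style E4M3;
    the OCP E4M3 variant tops out at 448). Illustration only.
    """
    if method == "unit_scale":
        # unit_scale: always quantize with a scale of 1, ignoring measurements.
        return torch.ones_like(maxabs)
    scale = maxabs / fp8_max  # stretch/compress maxabs onto the FP8 full-scale
    if method == "maxabs_pow2":
        # round the scale to a power of two (rounding direction is illustrative)
        scale = torch.pow(2.0, torch.ceil(torch.log2(scale)))
    return scale

# A tensor whose largest observed magnitude during measurement was 100.0:
print(maxabs_to_scale(torch.tensor(100.0), method="maxabs_pow2"))  # tensor(0.5000)
print(maxabs_to_scale(torch.tensor(100.0), method="unit_scale"))   # tensor(1.)
```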
1 change: 1 addition & 0 deletions (file path not shown)
@@ -11,3 +11,4 @@ neural-compressor
lm_eval==0.4.3
peft
optimum-intel
+intel_extension_for_pytorch
1 change: 0 additions & 1 deletion (file path not shown)
@@ -217,7 +217,6 @@ def eval_func(model):


if args.load:
-# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
if args.int8 or args.int8_bf16_mixed:
print("load int8 model")
from neural_compressor.torch.quantization import load
1 change: 1 addition & 0 deletions (file path not shown)
@@ -10,3 +10,4 @@ einops
neural-compressor
lm_eval==0.4.3
peft
+intel_extension_for_pytorch
1 change: 0 additions & 1 deletion (file path not shown)
@@ -198,7 +198,6 @@ def run_fn(model):
user_model.save(args.output_dir)

if args.load:
-# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
if args.int8 or args.int8_bf16_mixed:
print("load int8 model")
from neural_compressor.torch.quantization import load
2 changes: 2 additions & 0 deletions (file path not shown)
@@ -13,3 +13,5 @@ tiktoken #qwen
einops #qwen
auto_round
lm-eval==0.4.3
+numba
+tbb
2 changes: 2 additions & 0 deletions (file path not shown)
@@ -13,3 +13,5 @@ einops #qwen
auto_round
lm-eval==0.4.3
huggingface_hub
+numba
+tbb
(file path not shown)
@@ -1,179 +1,96 @@
-Step-by-Step
-============
-This document describes the step-by-step instructions to run large language models (LLMs) on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch.
+Weight-only quantization
+===============

The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models.

-# Prerequisite
-## 1. Create Environment
+## Prerequisite
```
# Installation
pip install -r requirements.txt
```

-# Run
+## Support status on HPU

-Here is how to run the scripts:
+Below is the current support status on Intel Gaudi AI Accelerator with PyTorch.

-**Causal Language Modeling (CLM)**
+| woq_algo | Status |
+|--------------|----------|
+| GPTQ | &#10004;|

-`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows.
-### GPT-J-6b
+> We validated the typical LLMs such as: `meta-llama/Llama-2-7b-hf`, `EleutherAI/gpt-j-6B`, `facebook/opt-125m`.

-#### Quantization
+## Support status on CPU

-```bash
-# "--woq_algo GPTQ" is used to enable GPTQ algorithms
-# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
-python run_clm_no_trainer.py \
---model EleutherAI/gpt-j-6B \
---dataset NeelNanda/pile-10k \
---quantize \
---woq_algo GPTQ \
---woq_bits 4 \
---woq_scheme asym \
---woq_group_size 128 \
---gptq_max_seq_length 2048 \
---gptq_use_max_length \
---double_quant_type "BNB_NF4" \
---output_dir saved_results
+Below is the current support status on Intel® Xeon® Scalable Processor with PyTorch.

# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"
--output_dir saved_results

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128
| woq_algo | status |
|--------------|----------|
| RTN | &#10004; |
| GPTQ | &#10004; |
| AutoRound| &#10004; |
| AWQ | &#10004; |
| TEQ | &#10004; |

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128
> We validated the typical LLMs such as: `meta-llama/Llama-2-7b-hf`, `EleutherAI/gpt-j-6B`, `facebook/opt-125m`.

# "--accuracy" for eval
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--output_dir saved_results
```
**Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.

+## Run

-### OPT-125m
+`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates datasets accuracy provided by lm_eval, an example command is as follows.

-#### Quantization
+### Quantization

```bash
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
python run_clm_no_trainer.py \
---model facebook/opt-125m \
+--model meta-llama/Llama-2-7b-hf \
--dataset NeelNanda/pile-10k \
--quantize \
+--batch_size 8 \
--woq_algo GPTQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo RTN \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128
--output_dir saved_results
```
+### Evaluation

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
```bash
# original model
python run_clm_no_trainer.py \
---model facebook/opt-125m \
---dataset NeelNanda/pile-10k \
---quantize \
---woq_algo AutoRound \
---woq_bits 4 \
---woq_scheme asym \
---woq_group_size 128
+--model meta-llama/Llama-2-7b-hf \
+--accuracy \
+--batch_size 8 \
+--tasks "lambada_openai,wikitext" \
+--output_dir saved_results

# "--accuracy" for eval
# quantized model
python run_clm_no_trainer.py \
---model facebook/opt-125m \
---dataset NeelNanda/pile-10k \
---int8 \
+--model meta-llama/Llama-2-7b-hf \
+--load \
--accuracy \
--tasks "lambada_openai" \
--batch_size 8 \
--tasks "lambada_openai,wikitext" \
--output_dir saved_results
```

-### LLAMA2-7b/13b/70b
-#### Quantization
+### Benchmark

```bash
# "--double_quant_type BNB_NF4" is used to enable double quant algorithms
# "--woq_algo GPTQ" is used to enable GPTQ algorithms
# original model
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
---dataset NeelNanda/pile-10k \
---quantize \
---woq_algo GPTQ \
---woq_bits 4 \
---woq_scheme asym \
---woq_group_size 128 \
---gptq_max_seq_length 2048 \
---gptq_use_max_length \
---double_quant_type "BNB_NF4"
+--performance \
+--batch_size 8 \
+--output_dir saved_results

# "--woq_algo RTN" is used to enable RTN algorithms
# quantized model
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
---dataset NeelNanda/pile-10k \
---quantize \
---woq_algo RTN \
---woq_bits 4 \
---woq_scheme asym \
---woq_group_size 128 \
---double_quant_type "BNB_NF4"
+--load \
+--performance \
+--batch_size 8 \
+--output_dir saved_results
```


[1]. Elias, Frantar, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2023).
[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
+For more information about parameter usage, please refer to [PT_WeightOnlyQuant.md](https://github.com/intel/neural-compressor/blob/master/docs/source/3x/PT_WeightOnlyQuant.md)
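The rewritten README above drives everything through run_clm_no_trainer.py flags. As orientation, below is a hedged sketch of roughly what the `--woq_algo GPTQ` / `--output_dir` / `--load` path corresponds to in the Intel Neural Compressor 3.x PyTorch API (prepare/convert with a GPTQConfig, then load). The calibration step, the GPTQConfig arguments, and the load() call shown here are simplified assumptions, not the example script itself, and may need adjusting to the installed version.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import GPTQConfig, convert, load, prepare

model_name = "meta-llama/Llama-2-7b-hf"  # same model the README uses
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Roughly: --woq_algo GPTQ --woq_bits 4 --woq_scheme asym --woq_group_size 128
quant_config = GPTQConfig(bits=4, use_sym=False, group_size=128)

def calib_fn(m):
    # The real example calibrates on NeelNanda/pile-10k; a single toy batch here.
    batch = tokenizer("Intel Neural Compressor supports weight-only quantization.",
                      return_tensors="pt")
    m(**batch)

model = prepare(model, quant_config)  # attach GPTQ hooks/observers
calib_fn(model)                       # run calibration data through the model
model = convert(model)                # produce the 4-bit weight-only model
model.save("saved_results")           # what --output_dir saved_results stores

# The --load path rebuilds the original graph and restores the quantized weights.
original = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
q_model = load("saved_results", original)
```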
3 changes: 2 additions & 1 deletion (file path not shown)
@@ -11,4 +11,5 @@ neural-compressor
lm_eval==0.4.3
peft
auto_round
-intel_extension_for_pytorch
+numba
+tbb
2 changes: 1 addition & 1 deletion (file path not shown)
@@ -66,7 +66,7 @@ function run_benchmark {
fi

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
extra_cmd=$extra_cmd" --load"
fi
echo $extra_cmd
