Support AutoRound quantization method for intel GPU (#1428)
Co-authored-by: kevinintel <hanwen.chang@intel.com>
Co-authored-by: Wenxin Zhang <wenxin.zhang@intel.com>
Co-authored-by: changwangss <chang1.wang@intel.com>
4 people committed Apr 2, 2024
1 parent ab2fd05 commit 7084e7f
Showing 14 changed files with 615 additions and 169 deletions.
7 changes: 5 additions & 2 deletions docs/weightonlyquant.md
@@ -17,7 +17,7 @@ As large language models (LLMs) become more prevalent, there is a growing need f
| Support Device | Rtn | Awq | Teq | GPTQ | AutoRound |
|:--------------:|:----------:|:----------:|:----------:|:----:|:----:|
| Intel CPU | &#10004; | &#10004; | &#10004; | &#10004; | &#10004; |
| Intel GPU | &#10004; | stay tuned | stay tuned | stay tuned | stay tuned |
| Intel GPU | &#10004; | stay tuned | stay tuned | &#10004; | &#10004; |

**RTN**[[1\]](https://github.com/intel/intel-extension-for-transformers/blob/548c13ed2e19cde91729530ca26c3b875c1b3d10/docs/weightonlyquant.md#1)(&#9733;&#9733;&#9733;): Rounding to Nearest (RTN) is an intuitively simple method that rounds values to the nearest integer. It boasts simplicity, requires no additional dataset, and offers fast quantization. Besides, it can easily be applied to other data types such as NF4 (non-uniform). Typically, it performs well on configurations such as W4G32 or W8, but worse than more advanced algorithms at lower precision levels.
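To make the rounding scheme concrete, below is a minimal sketch of group-wise symmetric INT4 RTN quantization (the W4G32 setting) written in plain PyTorch; the function name, group size, and tensor shapes are illustrative only, not the library's internal implementation.

```python
import torch

def rtn_quantize_w4g32(weight: torch.Tensor, group_size: int = 32):
    """Illustrative symmetric INT4 round-to-nearest quantization with per-group scales."""
    out_features, in_features = weight.shape
    grouped = weight.reshape(out_features, in_features // group_size, group_size)
    # One scale per group: the largest magnitude in the group maps to the INT4 limit (7).
    scale = grouped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(grouped / scale), min=-8, max=7)       # round to nearest integer
    dequant = (q * scale).reshape(out_features, in_features)           # reconstruction for inspection
    return q.to(torch.int8), scale, dequant

weight = torch.randn(16, 64)
q, scale, weight_hat = rtn_quantize_w4g32(weight)
print("max RTN error:", (weight - weight_hat).abs().max().item())
```

The main tunable here is the group size: smaller groups give finer-grained scales and lower quantization error at the cost of more scale metadata.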

@@ -147,7 +147,10 @@ loaded_model = AutoModelForCausalLM.from_pretrained(saved_dir)
> Note: For LLM runtime model loading usage, please refer to [neural_speed readme](https://github.com/intel/neural-speed/blob/main/README.md#quick-start-transformer-like-usage)
## Examples For Intel GPU
Intel-extension-for-transformers implements weight-only quantization for Intel GPU (PVC and ARC) with [Intel-extension-for-pytorch](https://github.com/intel/intel-extension-for-pytorch). Currently, the Linear op kernel for weight-only quantization is implemented in the Intel-extension-for-pytorch branch "dev/QLLM".

4-bit/8-bit inference with `RtnConfig`, `AwqConfig`, `GPTQConfig`, and `AutoRoundConfig` is now supported on the Intel GPU device.

We support experimental weight-only quantization (WOQ) inference on Intel GPU (PVC and ARC) by replacing the Linear op in PyTorch. Validated models: Qwen-7B, GPT-J-6B.
Here is the example code.
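The full example is collapsed in this diff; the following is a minimal sketch of 4-bit AutoRound inference on an Intel GPU, assuming `AutoRoundConfig` and `AutoModelForCausalLM` are importable from `intel_extension_for_transformers.transformers` and that `ipex.optimize_transformers` behaves as described in this document. The model name, calibration defaults, and generation arguments are illustrative.

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, AutoRoundConfig

device = "xpu"               # Intel GPU (PVC or ARC)
model_name = "Qwen/Qwen-7B"  # one of the validated models

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
prompt = "Once upon a time, there existed a little girl,"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# 4-bit AutoRound weight-only quantization; calibration options are left at their defaults here.
woq_config = AutoRoundConfig(bits=4, tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=woq_config,
    trust_remote_code=True,
    device_map=device,
)

# Optional: let intel-extension-for-pytorch optimize the quantized model for the XPU device.
model = ipex.optimize_transformers(model, inplace=True, dtype=torch.float16, woq=True, device=device)

output = model.generate(inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Swapping `AutoRoundConfig` for `RtnConfig` or `GPTQConfig` follows the same pattern.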

224 changes: 224 additions & 0 deletions examples/.config/pytorch_optimize.json
@@ -1576,6 +1576,230 @@
}
}
},
"mistral_7b_autoround_neuralspeed": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_gptq_neuralspeed": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"backend": "neuralspeed",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},

"mistral_7b_rtn_neuralspeed": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_autoround": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_gptq": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},

"mistral_7b_rtn": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_autoround_neuralspeed_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"mistral_7b_gptq_neuralspeed_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"backend": "neuralspeed",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"mistral_7b_autoround_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"mistral_7b_gptq_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"dolly_v2_3b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
@@ -133,6 +133,8 @@ python run_generation.py \
>**Note**:
> 1. The default search algorithm is beam search with num_beams = 1.
> 2. [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.10%2Bxpu/docs/tutorials/llm/llm_optimize_transformers.md) supports optimized inference for the model types "gptj," "mistral," "qwen," and "llama" to achieve high performance and accuracy; inference for other model types remains accurate as well.
> 3. We provide the `WeightOnlyQuant` compression technology with the `Rtn/GPTQ/AutoRound` algorithms; `load_in_4bit` and `load_in_8bit` also work on the Intel GPU device, as shown in the sketch below.
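
A minimal sketch of the `load_in_4bit` path on the Intel GPU device (the model name and generation arguments are illustrative, and the keyword set follows the usage described in this README, so treat the details as assumptions rather than a reference):

```python
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "Qwen/Qwen-7B"  # illustrative choice
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# 4-bit weight-only quantization is applied on the fly when load_in_4bit=True.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="xpu",
    trust_remote_code=True,
)

inputs = tokenizer("The Intel GPU runs", return_tensors="pt").input_ids.to("xpu")
print(tokenizer.decode(model.generate(inputs, max_new_tokens=16)[0], skip_special_tokens=True))
```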
## Prerequisite
### Dependencies
Intel-extension-for-pytorch depends on the oneAPI package, so install oneAPI before installing intel-extension-for-pytorch. Please refer to the [Installation Guide](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.10%2Bxpu) to install oneAPI to the "/opt/intel" folder.
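
Once oneAPI and intel-extension-for-pytorch are installed, a quick sanity check can confirm the GPU is visible to PyTorch (a minimal sketch, assuming the `torch.xpu` namespace is registered by the XPU build of intel-extension-for-pytorch):

```python
import torch
import intel_extension_for_pytorch as ipex  # registers the "xpu" device with PyTorch

# Both calls below are assumed to come from the XPU build of intel-extension-for-pytorch.
print("XPU available:", torch.xpu.is_available())
if torch.xpu.is_available():
    print("Device name:", torch.xpu.get_device_name(0))
```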
@@ -13,6 +13,7 @@ bitsandbytes #baichuan
transformers_stream_generator
tiktoken #qwen
einops #qwen
neural-speed
auto-round
git+https://github.com/intel/neural-compressor.git
git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
@@ -5,7 +5,7 @@ protobuf
sentencepiece != 0.1.92
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
torch==2.1.0a0
transformers
transformers==4.35.2
optimum-intel
bitsandbytes #baichuan
transformers_stream_generator
@@ -51,6 +51,9 @@ function init_params {
--backend=*)
backend=$(echo $var |cut -f2 -d=)
;;
--model_source=*)
model_source=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
@@ -150,10 +153,16 @@ function run_benchmark {
model_name_or_path="Intel/neural-chat-7b-v3"
elif [ "${topology}" = "phi_1b" ]; then
model_name_or_path="susnato/phi-1_dev"
pip install transformers==4.36.1
elif [ "${topology}" = "phi_1_5b" ]; then
model_name_or_path="susnato/phi-1_5_dev"
pip install transformers==4.36.1
elif [ "${topology}" = "llama2_7b_int4_gptq" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
elif [ "${topology}" = "mistral_7b_int4_autoround" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
elif [ "${topology}" = "mistral_7b_int4_rtn" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
fi

if [[ ${int8} == "true" ]]; then
@@ -168,9 +177,51 @@
elif [ "${topology}" = "gpt_j_mp" ]; then
extra_cmd=$extra_cmd" --mixed_precision"
elif [ "${topology}" = "llama2_7b_int4_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq --bits 4 --weight_dtype int4_clip --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --max_input_length 2048 "
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="TheBloke/Llama-2-7B-Chat-GPTQ"
else
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
elif [ "${topology}" = "mistral_7b_int4_autoround" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="Intel/Mistral-7B-v0.1-int4-inc"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi

elif [ "${topology}" = "mistral_7b_int4_rtn" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="mistralai/Mistral-7B-v0.1"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi

elif [ "${topology}" = "mistral_7b_int4_gptq" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
else
extra_cmd=$extra_cmd" --int8"
fi
