Support AutoRound quantization method for intel GPU (#1428)
Co-authored-by: kevinintel <hanwen.chang@intel.com>
Co-authored-by: Wenxin Zhang <wenxin.zhang@intel.com>
Co-authored-by: changwangss <chang1.wang@intel.com>
4 people committed Apr 2, 2024
1 parent ab2fd05 commit 7084e7f
Showing 14 changed files with 615 additions and 169 deletions.
7 changes: 5 additions & 2 deletions docs/weightonlyquant.md
@@ -17,7 +17,7 @@ As large language models (LLMs) become more prevalent, there is a growing need f
| Support Device | Rtn | Awq | Teq | GPTQ | AutoRound |
|:--------------:|:----------:|:----------:|:----------:|:----:|:----:|
| Intel CPU | &#10004; | &#10004; | &#10004; | &#10004; | &#10004; |
| Intel GPU | &#10004; | stay tuned | stay tuned | stay tuned | stay tuned |
| Intel GPU | &#10004; | stay tuned | stay tuned | &#10004; | &#10004; |

**RTN**[[1\]](https://github.com/intel/intel-extension-for-transformers/blob/548c13ed2e19cde91729530ca26c3b875c1b3d10/docs/weightonlyquant.md#1)(&#9733;&#9733;&#9733;): Rounding to Nearest (RTN) is an intuitively simple method that rounds values to the nearest integer. It boasts simplicity, requires no additional dataset, and offers fast quantization. Besides, it can easily be applied to other data types such as NF4 (non-uniform). Typically, it performs well on configurations such as W4G32 or W8, but worse than more advanced algorithms at lower precision levels.
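To make the rounding scheme concrete, below is a minimal sketch of group-wise symmetric INT4 RTN quantization (the W4G32 setting) written in plain PyTorch; the function name, group size, and tensor shapes are illustrative only, not the library's internal implementation.

```python
import torch

def rtn_quantize_w4g32(weight: torch.Tensor, group_size: int = 32):
    """Illustrative symmetric INT4 round-to-nearest quantization with per-group scales."""
    out_features, in_features = weight.shape
    grouped = weight.reshape(out_features, in_features // group_size, group_size)
    # One scale per group: the largest magnitude in the group maps to the INT4 limit (7).
    scale = grouped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(grouped / scale), min=-8, max=7)       # round to nearest integer
    dequant = (q * scale).reshape(out_features, in_features)           # reconstruction for inspection
    return q.to(torch.int8), scale, dequant

weight = torch.randn(16, 64)
q, scale, weight_hat = rtn_quantize_w4g32(weight)
print("max RTN error:", (weight - weight_hat).abs().max().item())
```

The main tunable here is the group size: smaller groups give finer-grained scales and lower quantization error at the cost of more scale metadata.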

@@ -147,7 +147,10 @@ loaded_model = AutoModelForCausalLM.from_pretrained(saved_dir)
> Note: For LLM runtime model loading usage, please refer to [neural_speed readme](https://github.com/intel/neural-speed/blob/main/README.md#quick-start-transformer-like-usage)
## Examples For Intel GPU
Intel-extension-for-transformers implements weight-only quantization for Intel GPU (PVC and ARC) with [Intel-extension-for-pytorch](https://github.com/intel/intel-extension-for-pytorch). Currently, the Linear op kernel for weight-only quantization is implemented in the Intel-extension-for-pytorch branch "dev/QLLM".

4-bit/8-bit inference with `RtnConfig`, `AwqConfig`, `GPTQConfig`, and `AutoRoundConfig` is now supported on the Intel GPU device.

We support experimental weight-only quantization (WOQ) inference on Intel GPU (PVC and ARC) by replacing the Linear op in PyTorch. Validated models: Qwen-7B, GPT-J-6B.
Here is the example code.
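The full example is collapsed in this diff; the following is a minimal sketch of 4-bit AutoRound inference on an Intel GPU, assuming `AutoRoundConfig` and `AutoModelForCausalLM` are importable from `intel_extension_for_transformers.transformers` and that `ipex.optimize_transformers` behaves as described in this document. The model name, calibration defaults, and generation arguments are illustrative.

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, AutoRoundConfig

device = "xpu"               # Intel GPU (PVC or ARC)
model_name = "Qwen/Qwen-7B"  # one of the validated models

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
prompt = "Once upon a time, there existed a little girl,"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# 4-bit AutoRound weight-only quantization; calibration options are left at their defaults here.
woq_config = AutoRoundConfig(bits=4, tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=woq_config,
    trust_remote_code=True,
    device_map=device,
)

# Optional: let intel-extension-for-pytorch optimize the quantized model for the XPU device.
model = ipex.optimize_transformers(model, inplace=True, dtype=torch.float16, woq=True, device=device)

output = model.generate(inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Swapping `AutoRoundConfig` for `RtnConfig` or `GPTQConfig` follows the same pattern.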

224 changes: 224 additions & 0 deletions examples/.config/pytorch_optimize.json
@@ -1576,6 +1576,230 @@
}
}
},
"mistral_7b_autoround_neuralspeed": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_gptq_neuralspeed": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"backend": "neuralspeed",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},

"mistral_7b_rtn_neuralspeed": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_autoround": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_gptq": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},

"mistral_7b_rtn": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_rtn",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"mistral_7b_autoround_neuralspeed_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"backend": "neuralspeed",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"mistral_7b_gptq_neuralspeed_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"backend": "neuralspeed",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"mistral_7b_autoround_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_autoround",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"mistral_7b_gptq_hf": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mistral_7b_int4_gptq",
"task": "generation",
"mode": "benchmark",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results",
"model_source": "huggingface"
}
}
},
"dolly_v2_3b_gen_ipex_static": {
"working_dir": "huggingface/pytorch/text-generation/quantization",
"tune":{
@@ -133,6 +133,8 @@ python run_generation.py \
>**Note**:
> 1. The default search algorithm is beam search with num_beams = 1.
> 2. [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.10%2Bxpu/docs/tutorials/llm/llm_optimize_transformers.md) supports optimized inference for the model types "gptj," "mistral," "qwen," and "llama" to achieve high performance and accuracy; inference for other model types remains accurate as well.
> 3. We provide the `WeightOnlyQuant` compression technology with the `Rtn/GPTQ/AutoRound` algorithms; `load_in_4bit` and `load_in_8bit` also work on the Intel GPU device, as shown in the sketch below.
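
A minimal sketch of the `load_in_4bit` path on the Intel GPU device (the model name and generation arguments are illustrative, and the keyword set follows the usage described in this README, so treat the details as assumptions rather than a reference):

```python
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "Qwen/Qwen-7B"  # illustrative choice
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# 4-bit weight-only quantization is applied on the fly when load_in_4bit=True.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="xpu",
    trust_remote_code=True,
)

inputs = tokenizer("The Intel GPU runs", return_tensors="pt").input_ids.to("xpu")
print(tokenizer.decode(model.generate(inputs, max_new_tokens=16)[0], skip_special_tokens=True))
```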
## Prerequisite
### Dependencies
Intel-extension-for-pytorch depends on the oneAPI package, so install oneAPI before installing intel-extension-for-pytorch. Please refer to the [Installation Guide](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.10%2Bxpu) to install oneAPI to the "/opt/intel" folder.
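
Once oneAPI and intel-extension-for-pytorch are installed, a quick sanity check can confirm the GPU is visible to PyTorch (a minimal sketch, assuming the `torch.xpu` namespace is registered by the XPU build of intel-extension-for-pytorch):

```python
import torch
import intel_extension_for_pytorch as ipex  # registers the "xpu" device with PyTorch

# Both calls below are assumed to come from the XPU build of intel-extension-for-pytorch.
print("XPU available:", torch.xpu.is_available())
if torch.xpu.is_available():
    print("Device name:", torch.xpu.get_device_name(0))
```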
@@ -13,6 +13,7 @@ bitsandbytes #baichuan
transformers_stream_generator
tiktoken #qwen
einops #qwen
neural-speed
auto-round
git+https://github.com/intel/neural-compressor.git
git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
@@ -5,7 +5,7 @@ protobuf
sentencepiece != 0.1.92
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
torch==2.1.0a0
transformers
transformers==4.35.2
optimum-intel
bitsandbytes #baichuan
transformers_stream_generator
@@ -51,6 +51,9 @@ function init_params {
--backend=*)
backend=$(echo $var |cut -f2 -d=)
;;
--model_source=*)
model_source=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
@@ -150,10 +153,16 @@ function run_benchmark {
model_name_or_path="Intel/neural-chat-7b-v3"
elif [ "${topology}" = "phi_1b" ]; then
model_name_or_path="susnato/phi-1_dev"
pip install transformers==4.36.1
elif [ "${topology}" = "phi_1_5b" ]; then
model_name_or_path="susnato/phi-1_5_dev"
pip install transformers==4.36.1
elif [ "${topology}" = "llama2_7b_int4_gptq" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
elif [ "${topology}" = "mistral_7b_int4_autoround" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
elif [ "${topology}" = "mistral_7b_int4_rtn" ] && [ "$model_source" != "huggingface" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
fi

if [[ ${int8} == "true" ]]; then
@@ -168,9 +177,51 @@
elif [ "${topology}" = "gpt_j_mp" ]; then
extra_cmd=$extra_cmd" --mixed_precision"
elif [ "${topology}" = "llama2_7b_int4_gptq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq --bits 4 --weight_dtype int4_clip --compute_dtype fp32 --scheme asym "
extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --max_input_length 2048 "
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="TheBloke/Llama-2-7B-Chat-GPTQ"
else
model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
elif [ "${topology}" = "mistral_7b_int4_autoround" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="Intel/Mistral-7B-v0.1-int4-inc"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi

elif [ "${topology}" = "mistral_7b_int4_rtn" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="mistralai/Mistral-7B-v0.1"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi

elif [ "${topology}" = "mistral_7b_int4_gptq" ]; then
if [[ "$model_source" == "huggingface" ]]; then
model_name_or_path="TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
else
model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
extra_cmd=$extra_cmd" --trust_remote_code"
extra_cmd=$extra_cmd" --woq_loading"
fi
if [[ $backend == "neuralspeed" ]]; then
extra_cmd=$extra_cmd" --use_neural_speed"
fi
else
extra_cmd=$extra_cmd" --int8"
fi
