[NeuralChat] Support GPTQ, AWQ model in NeuralChat (#1206)
lvliang-intel committed Feb 1, 2024
1 parent 1c8078f commit 5b08dee
Showing 9 changed files with 185 additions and 6 deletions.
@@ -0,0 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is the parameter configuration file for NeuralChat Serving.

#################################################################################
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 8000

# Download Hugging Face GPTQ model to local path.
model_name_or_path: "./Magicoder-S-DS-6.7B-GPTQ"
device: "cpu"
use_llm_runtime: true
use_gptq: true

# task choices = ['textchat', 'voicechat', 'retrieval', 'text2image', 'finetune', 'codegen']
tasks_list: ['codegen']
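For reference, the same GPTQ setup can also be driven from the Python API rather than a YAML file; this is a minimal sketch based on the unit test added in this commit (the model path assumes the GPTQ model has been downloaded locally, as in the config above):

```python
from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig
from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig

# Load a GPTQ-quantized model through the LLM runtime (local path as in codegen.yaml above)
loading_config = LoadingModelConfig(use_llm_runtime=True)
optimization_config = WeightOnlyQuantConfig(use_gptq=True)
config = PipelineConfig(model_name_or_path="./Magicoder-S-DS-6.7B-GPTQ",
                        optimization_config=optimization_config,
                        loading_config=loading_config)
chatbot = build_chatbot(config=config)
print(chatbot.predict("Write a Python function that reverses a string."))
```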
@@ -0,0 +1,64 @@
This README is designed to walk you through setting up the backend for a code-generating chatbot using the NeuralChat framework. You can deploy this chatbot on various platforms, including Intel Xeon Scalable Processors, Habana's Gaudi processors (HPU), Intel Data Center and Client GPUs, and Nvidia Data Center and Client GPUs.

This example demonstrates how to deploy the code-generating chatbot specifically on a laptop PC. To ensure smooth operation on a laptop, we apply [LLM runtime optimization](../../../../../../llm/runtime/graph/README.md) to accelerate the inference process.

# Setup Conda

First, you need to install and configure the Conda environment:

1. Visit the [Miniconda download page](https://docs.conda.io/projects/miniconda/en/latest/) and download the installer suitable for your Windows system.
2. Locate the downloaded installer file (e.g., `Miniconda3-latest-Windows-x86_64.exe` for Miniconda) and double-click it to launch the installation.
3. Create a new Conda environment with `conda create -n myenv python=3.9.0`, as shown below.
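
A minimal sketch of the corresponding commands from a Conda prompt (the environment name `myenv` is illustrative, and the activation step is an assumed follow-up):

```bash
# Create and activate a dedicated environment for NeuralChat
conda create -n myenv python=3.9.0
conda activate myenv
```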

# Install Visual C++ Build Tools

Visual C++ Build Tools is a package provided by Microsoft that includes the tools required to build C++ projects with Visual Studio without installing the full Visual Studio IDE. These tools are essential for compiling, linking, and building Intel Extension for Transformers.

To install the Visual C++ Build Tools, visit the following link: [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/).
Once there, you'll find download options and instructions for installation based on your specific requirements.

# Install Intel Extension for Transformers

Install Intel Extension for Transformers from source to get the latest features of the LLM runtime.

```bash
git clone https://github.com/intel/intel-extension-for-transformers.git
cd intel-extension-for-transformers
pip install -r requirements.txt
pip install -e .
```

# Install Python dependencies

Install dependencies using pip

```bash
pip install -r ../../../../../requirements_pc.txt
pip install transformers==4.35.2
```

# Configure the codegen.yaml

You can customize the configuration file 'codegen.yaml' to match your environment setup. Here's a table to help you understand the configurable options:

| Item                             | Value                        |
| -------------------------------- | ---------------------------- |
| host                             | 127.0.0.1                    |
| port                             | 8000                         |
| model_name_or_path               | "codellama/CodeLlama-7b-hf"  |
| device                           | "cpu"                        |
| tasks_list                       | ['textchat']                 |
| optimization: use_llm_runtime    | true                         |
| optimization: optimization_type  | "weight_only"                |
| optimization: compute_dtype      | "int8"                       |
| optimization: weight_dtype       | "int4"                       |
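
For reference, a minimal sketch of a `codegen.yaml` matching the values above (placing the quantization keys under an `optimization:` section mirrors the table; for this code-generation example you would typically set `tasks_list: ['codegen']`, as in the codegen.yaml shipped alongside this README):

```yaml
host: 127.0.0.1
port: 8000

model_name_or_path: "codellama/CodeLlama-7b-hf"
device: "cpu"

# itrex int4 llm runtime optimization
optimization:
  use_llm_runtime: true
  optimization_type: "weight_only"
  compute_dtype: "int8"
  weight_dtype: "int4"

tasks_list: ['textchat']
```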



# Run the Code Generation Chatbot server
To start the code-generating chatbot server, use the following command:

```shell
nohup python run_code_gen.py &
```
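The server runs in the background. A quick way to confirm it started is to follow the log file configured in `run_code_gen.py` (`./codegen.log` in this example):

```shell
tail -f codegen.log
```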
@@ -24,7 +24,7 @@ host: 0.0.0.0
 port: 8000
 
 # if you want to run "codellama/CodeLlama-7b-hf", please download it to local and pass the local path.
-model_name_or_path: "bigcode/starcoderbase-1b"
+model_name_or_path: "ise-uiuc/Magicoder-S-DS-6.7B"
 device: "cpu"
 
 # itrex int4 llm runtime optimization
@@ -0,0 +1,26 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor

def main():
    server_executor = NeuralChatServerExecutor()
    server_executor(config_file="./codegen.yaml", log_file="./codegen.log")

if __name__ == "__main__":
    main()
@@ -550,8 +550,9 @@ def load_model(
     else:
         optimization_config.post_init()
         model = optimize_model(model_name, optimization_config, use_llm_runtime)
-    if not model.config.is_encoder_decoder:
-        tokenizer.padding_side = "left"
+    if hasattr(model, 'config'):
+        if model.config.is_encoder_decoder:
+            tokenizer.padding_side = "left"
     if tokenizer.pad_token is None and tokenizer.pad_token_id is None:
         tokenizer.pad_token = tokenizer.eos_token
     MODELS[model_name]["model"] = model
@@ -224,7 +224,10 @@ def init(self, config):
         optimization_config = None
         yaml_config = config.get("optimization", {})
         ipex_int8 = yaml_config.get("ipex_int8", False)
-        use_llm_runtime = yaml_config.get("use_llm_runtime", {})
+        use_llm_runtime = yaml_config.get("use_llm_runtime", False)
+        use_gptq = yaml_config.get("use_gptq", False)
+        use_awq = yaml_config.get("use_awq", False)
+        use_autoround = yaml_config.get("use_autoround", {})
         optimization_type = yaml_config.get("optimization_type", {})
         compute_dtype = yaml_config.get("compute_dtype", {})
         weight_dtype = yaml_config.get("weight_dtype", {})
@@ -240,8 +243,15 @@ def init(self, config):
                                             world_size=world_size)
         from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig, MixedPrecisionConfig
         if optimization_type == "weight_only":
-            optimization_config = WeightOnlyQuantConfig(compute_dtype=compute_dtype, weight_dtype=weight_dtype,
-                                                        use_ggml=use_ggml, use_cache=use_cached_bin)
+            if use_gptq:
+                optimization_config = WeightOnlyQuantConfig(use_gptq=use_gptq)
+            elif use_awq:
+                optimization_config = WeightOnlyQuantConfig(use_gptq=use_awq)
+            elif use_autoround:
+                optimization_config = WeightOnlyQuantConfig(use_gptq=use_autoround)
+            else:
+                optimization_config = WeightOnlyQuantConfig(compute_dtype=compute_dtype, weight_dtype=weight_dtype,
+                                                            use_ggml=use_ggml, use_cache=use_cached_bin)
         elif optimization_type == "mix_precision":
             optimization_config = MixedPrecisionConfig(dtype=mix_precision_dtype)
         elif optimization_type == "bits_and_bytes":
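
For reference, a sketch of an `optimization` section exercising the new flags (placement under `optimization:` follows the `config.get("optimization", {})` call above; note that the codegen.yaml added in this commit sets `use_gptq` at the top level instead):

```yaml
optimization:
  use_llm_runtime: true
  optimization_type: "weight_only"
  use_gptq: true   # alternatively use_awq or use_autoround; set only one
```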
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig
from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type
import unittest

class TestLlama2GPTQModel(unittest.TestCase):
    def setUp(self):
        self.device = get_device_type()
        return super().setUp()

    def tearDown(self) -> None:
        return super().tearDown()

    def test_code_gen_with_gguf(self):
        if self.device == "hpu":
            self.skipTest("GPTQ is not supported on HPU.")
        loading_config = LoadingModelConfig(use_llm_runtime=True)
        optimization_config = WeightOnlyQuantConfig(use_gptq=True)
        config = PipelineConfig(model_name_or_path="/tf_dataset2/models/nlp_toolkit/Llama-2-7B-Chat-GPTQ",
                                optimization_config=optimization_config,
                                loading_config=loading_config)
        chatbot = build_chatbot(config=config)
        result = chatbot.predict("Tell me about Intel Xeon Scalable Processors.")
        print(result)
        self.assertIn('Intel Xeon Scalable Processors', str(result))

if __name__ == "__main__":
    unittest.main()
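
The test can be run directly as a script (the file name below is a placeholder; the model path above points to an internal dataset location, so a local GPTQ checkpoint would be needed to reproduce it):

```shell
python test_llama2_gptq.py
```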
