[NeuralChat] Support GGUF model in NeuralChat (#1200)
* Support GGUF model in NeuralChat

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
lvliang-intel committed Feb 8, 2024
1 parent 3068496 commit a53a33c
Showing 9 changed files with 209 additions and 20 deletions.
1 change: 1 addition & 0 deletions intel_extension_for_transformers/neural_chat/chatbot.py
@@ -273,6 +273,7 @@ def build_chatbot(config: PipelineConfig=None):
parameters["peft_path"] = config.loading_config.peft_path
parameters["use_deepspeed"] = config.loading_config.use_deepspeed
parameters["use_llm_runtime"] = config.loading_config.use_llm_runtime
parameters["gguf_model_path"] = config.loading_config.gguf_model_path
parameters["optimization_config"] = config.optimization_config
parameters["hf_access_token"] = config.hf_access_token
parameters["assistant_model"] = config.assistant_model
1 change: 1 addition & 0 deletions intel_extension_for_transformers/neural_chat/config.py
@@ -421,6 +421,7 @@ class LoadingModelConfig:
world_size: int = 1
ipex_int8: bool = False
use_llm_runtime: bool = False
gguf_model_path: str = None

@dataclass
class FrameworkConfig:
@@ -0,0 +1,64 @@
This README walks you through setting up the backend for a code-generating chatbot using the NeuralChat framework. You can deploy this chatbot on various platforms, including Intel Xeon Scalable Processors, Habana Gaudi processors (HPU), Intel Data Center and Client GPUs, and NVIDIA Data Center and Client GPUs.

This guide demonstrates how to deploy the code-generating chatbot specifically on a laptop PC. To ensure smooth operation on a laptop, we use [LLM runtime optimization](../../../../../../llm/runtime/graph/README.md) to accelerate the inference process.

# Setup Conda

First, you need to install and configure the Conda environment:

Visit the [Miniconda download page](https://docs.conda.io/projects/miniconda/en/latest/) and download the installer suitable for your Windows system.
Locate the downloaded installer file (e.g., `Miniconda3-latest-Windows-x86_64.exe` for Miniconda) and double-click it to launch the installation.
Once Conda is installed, create a new environment with `conda create -n myenv python=3.9.0`, as shown in the sketch below.
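A minimal sketch of this step (the environment name `myenv` is just an example; remember to activate the environment before installing anything into it):

```bash
# Create an isolated environment for the chatbot and switch into it
conda create -n myenv python=3.9.0 -y
conda activate myenv
```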

# Install Visual C++ Build Tools

Visual C++ Build Tools is a package provided by Microsoft that includes the tools required to build C++ projects with Visual Studio without installing the full Visual Studio IDE. These tools are required to compile, link, and build Intel Extension for Transformers.

To install the Visual C++ Build Tools, visit [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/), where you will find download options and installation instructions for your specific requirements.

# Install Intel Extension for Transformers

Install Intel Extension for Transformers from source to get the latest features of the LLM runtime.

```bash
git clone https://github.com/intel/intel-extension-for-transformers.git
cd intel-extension-for-transformers
pip install -r requirements.txt
pip install -e .
```
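Optionally, as a quick sanity check that the editable install is visible from your environment:

```bash
# Should print the package location without raising ImportError
python -c "import intel_extension_for_transformers as itrex; print(itrex.__file__)"
```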

# Install Python dependencies

Install the remaining Python dependencies using pip:

```bash
pip install -r ../../../../../requirements_pc.txt
pip install transformers==4.35.2
```

# Configure the codegen.yaml

You can customize the configuration file `codegen.yaml` to match your environment. The table below summarizes the configurable options (dotted names denote nested keys under `optimization`); a matching YAML sketch follows the table.

| Item                           | Value                       |
| ------------------------------ | --------------------------- |
| host                           | 127.0.0.1                   |
| port                           | 8000                        |
| model_name_or_path             | "codellama/CodeLlama-7b-hf" |
| device                         | "cpu"                       |
| tasks_list                     | ['textchat']                |
| optimization.use_llm_runtime   | true                        |
| optimization.optimization_type | "weight_only"               |
| optimization.compute_dtype     | "int8"                      |
| optimization.weight_dtype      | "int4"                      |

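For reference, here is a sketch of what `codegen.yaml` could look like with the values from the table. The nesting of the `optimization` block is an assumption based on the option names above; compare it against the `codegen.yaml` shipped next to this README before relying on it:

```yaml
# Hypothetical codegen.yaml assembled from the table above -- adjust to your environment
host: 127.0.0.1
port: 8000

model_name_or_path: "codellama/CodeLlama-7b-hf"
device: "cpu"

# task choices = ['textchat', 'voicechat', 'retrieval', 'text2image', 'finetune', 'codegen']
tasks_list: ['textchat']

optimization:
    use_llm_runtime: true
    optimization_type: "weight_only"
    compute_dtype: "int8"
    weight_dtype: "int4"
```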


# Run the Code Generation Chatbot server
To start the code-generating chatbot server, use the following command:

```shell
nohup python run_code_gen.py &
```
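If you prefer to drive the chatbot directly from Python instead of going through the server, the pattern below mirrors the GGUF unit test added in this commit (the model, tokenizer, and GGUF file names are taken from that test; substitute your own):

```python
from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig
from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig

# Load a local GGUF file; the tokenizer still comes from the original HF model repo
loading_config = LoadingModelConfig(gguf_model_path="llama-2-7b-chat.Q4_0.gguf")
config = PipelineConfig(model_name_or_path="TheBloke/Llama-2-7B-Chat-GGUF",
                        tokenizer_name_or_path="meta-llama/Llama-2-7b-chat-hf",
                        loading_config=loading_config)

chatbot = build_chatbot(config=config)
print(chatbot.predict("Write a Python function that prints 'Hello, world!'."))
```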
@@ -0,0 +1,36 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is the parameter configuration file for NeuralChat Serving.

#################################################################################
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 8000

# If you want to run "codellama/CodeLlama-7b-hf", please download it locally and pass the local path.
# model_name_or_path: "TheBloke/Magicoder-S-DS-6.7B-GGUF"
# tokenizer_name_or_path: "ise-uiuc/Magicoder-S-DS-6.7B"
# gguf_model_path: "magicoder-s-ds-6.7b.Q4_0.gguf"
model_name_or_path: "TheBloke/Llama-2-7B-Chat-GGUF"
tokenizer_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
gguf_model_path: "llama-2-7b-chat.Q4_0.gguf"
device: "cpu"

# task choices = ['textchat', 'voicechat', 'retrieval', 'text2image', 'finetune', 'codegen']
tasks_list: ['codegen']
@@ -0,0 +1,26 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor

def main():
server_executor = NeuralChatServerExecutor()
server_executor(config_file="./codegen.yaml", log_file="./codegen.log")

if __name__ == "__main__":
main()
@@ -137,7 +137,8 @@ def load_model(self, kwargs: dict):
use_llm_runtime=kwargs["use_llm_runtime"],
assistant_model=kwargs["assistant_model"],
use_vllm=kwargs["use_vllm"],
vllm_engine_params=kwargs["vllm_engine_params"])
vllm_engine_params=kwargs["vllm_engine_params"],
gguf_model_path=kwargs["gguf_model_path"])

def predict_stream(self, query, origin_query="", config=None):
"""
50 changes: 32 additions & 18 deletions intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -407,6 +407,7 @@ def load_model(
assistant_model=None,
use_vllm=False,
vllm_engine_params=None,
gguf_model_path=None,
):
"""
Load the model and initialize the tokenizer.
@@ -483,6 +484,7 @@ def load_model(
# load assistant model
if assistant_model:
print("Loading assistant model...")
from transformers import AutoModelForCausalLM
assistant_model_class = AutoModelForCausalLM
print(f"Loading assistant model via {assistant_model_class}")
assis_model = assistant_model_class.from_pretrained(
@@ -528,7 +530,7 @@ load_model(
or re.search("neural-chat-7b-v2", model_name, re.IGNORECASE)) else True,
use_auth_token=hf_access_token,
trust_remote_code=True if (re.search("qwen", model_name, re.IGNORECASE) or \
re.search("chatglm", model_name, re.IGNORECASE)) else False,
re.search("chatglm", model_name, re.IGNORECASE) or gguf_model_path) else False,
)
except EnvironmentError as e:
logging.error(f"Exception: {e}")
@@ -561,7 +563,18 @@ def load_model(
logging.info("Optimized Model loaded.")
return

if gguf_model_path:
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_name, model_file = gguf_model_path)
if tokenizer.pad_token is None and tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
MODELS[model_name]["model"] = model
MODELS[model_name]["tokenizer"] = tokenizer
logging.info("GGUF Model loaded.")
return

try:
from transformers import AutoModelForCausalLM
if device == "hpu" and use_deepspeed and load_to_meta:
with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
@@ -1033,14 +1046,15 @@ def predict_stream(**params):
tokenizer, skip_prompt=True, skip_special_tokens=True
)

context_len = get_context_length(model.config)
length = min(max_new_tokens, context_len - input_token_len)
if length <= 0:
logging.error(f"This model's maximum context length is {context_len} tokens. \
However, your messages resulted in {input_token_len} tokens. Please reduce the length of the messages.",
)
set_latest_error(ErrorCodes.WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH)
return
if "gguf" not in model_name.lower():
context_len = get_context_length(model.config)
length = min(max_new_tokens, context_len - input_token_len)
if length <= 0:
logging.error(f"This model's maximum context length is {context_len} tokens. \
However, your messages resulted in {input_token_len} tokens. Please reduce the length of the messages.",
)
set_latest_error(ErrorCodes.WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH)
return

generate_kwargs = get_generate_kwargs(
max_new_tokens, input_token_len,
@@ -1326,22 +1340,22 @@ def predict(**params):
"codellama" in model_name.lower() or \
"starcoder" in model_name.lower() or \
"codegen" in model_name.lower()) else 1024

input_tokens, input_token_len = tokenization(prompt, tokenizer, device)
generate_kwargs = get_generate_kwargs(
max_new_tokens, input_token_len,
get_stop_token_ids(model, tokenizer),
assistant_model=assistant_model
)

context_len = get_context_length(model.config)
length = min(max_new_tokens, context_len - input_token_len)
if length <= 0:
logging.error(f"This model's maximum context length is {context_len} tokens. \
However, your messages resulted in {input_token_len} tokens. Please reduce the length of the messages.",
)
set_latest_error(ErrorCodes.WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH)
return
if "gguf" not in model_name.lower():
context_len = get_context_length(model.config)
length = min(max_new_tokens, context_len - input_token_len)
if length <= 0:
logging.error(f"This model's maximum context length is {context_len} tokens. \
However, your messages resulted in {input_token_len} tokens. Please reduce the length of the messages.",
)
set_latest_error(ErrorCodes.WARNING_INPUT_EXCEED_MAX_SEQ_LENGTH)
return

if device in ["cpu", "cuda", "xpu"]:
if device in ["cuda", "xpu"]:
@@ -106,6 +106,7 @@ def init(self, config):
world_size = config.get("world_size", 1)
master_port = config.get("master_port", 29500)
model_name_or_path = config.get("model_name_or_path", "meta-llama/Llama-2-7b-hf")
gguf_model_path = config.get("gguf_model_path", None)
tokenizer_name_or_path = config.get("tokenizer_name_or_path", model_name_or_path)
peft_model_path = config.get("peft_model_path", "")
plugin_as_service = config.get("plugin_as_service", False)
@@ -240,7 +241,7 @@ def init(self, config):
bnb_4bit_compute_dtype = yaml_config.get("bnb_4bit_compute_dtype", {})
loading_config = LoadingModelConfig(ipex_int8=ipex_int8, use_llm_runtime=use_llm_runtime,
peft_path=peft_model_path, use_deepspeed=use_deepspeed,
world_size=world_size)
world_size=world_size, gguf_model_path=gguf_model_path)
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig, MixedPrecisionConfig
if optimization_type == "weight_only":
if use_gptq:
@@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig
from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig
from intel_extension_for_transformers.neural_chat.utils.common import get_device_type
import unittest

class TestLlama2GGUFModel(unittest.TestCase):
def setUp(self):
self.device = get_device_type()
return super().setUp()

def tearDown(self) -> None:
return super().tearDown()

def test_code_gen_with_gguf(self):
if self.device == "hpu":
self.skipTest("GGUF is not supported on HPU.")

loading_config = LoadingModelConfig(gguf_model_path="llama-2-7b-chat.Q4_0.gguf")
config = PipelineConfig(model_name_or_path="TheBloke/Llama-2-7B-Chat-GGUF",
tokenizer_name_or_path="meta-llama/Llama-2-7b-chat-hf",
loading_config=loading_config)
chatbot = build_chatbot(config=config)
result = chatbot.predict("Tell me about Intel Xeon Scalable Processors.")
print(result)
self.assertIn('Intel Xeon Scalable Processors', str(result))

if __name__ == "__main__":
unittest.main()
