Python api for cpp model (#252)
zhenwei-intel committed Sep 20, 2023
1 parent 2ab0e3b commit be651be
Showing 20 changed files with 624 additions and 80 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker/codeScan.dockerfile
@@ -33,7 +33,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \

RUN ln -sf $(which python3) /usr/bin/python

RUN python -m pip install --no-cache-dir pylint==2.12.1\
RUN python -m pip install --no-cache-dir pylint==2.17.5\
bandit==1.7.4\
pyspelling\
pydocstyle
1 change: 1 addition & 0 deletions .github/workflows/script/formatScan/pylint.sh
@@ -40,6 +40,7 @@ python -m pylint -f json --disable=R,C,W,E1129 \
--extension-pkg-whitelist=numpy,nltk \
--ignored-classes=TensorProto,NodeProto \
--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py \
--ignore-paths=/intel-extension-for-transformers/intel_extension_for_transformers/llm/runtime/graph/ \
/intel-extension-for-transformers/intel_extension_for_transformers >${log_dir}/pylint.json
exit_code=$?

@@ -10,5 +10,6 @@ omit =
*/intel_extension_for_transformers/llm/finetuning/**
*/intel_extension_for_transformers/llm/inference/**
*/intel_extension_for_transformers/llm/quantization/**
*/intel_extension_for_transformers/llm/runtime/graph/**
exclude_lines =
pragma: no cover
@@ -11,6 +11,7 @@ omit =
*/intel_extension_for_transformers/llm/library/**
*/intel_extension_for_transformers/llm/operator/**
*/intel_extension_for_transformers/llm/runtime/deprecated/**
*/intel_extension_for_transformers/llm/runtime/graph/**
*/intel_extension_for_transformers/neural_chat/**
*/intel_extension_for_transformers/transformers/modeling/**
*/intel_extension_for_transformers/transformers/utils/get_throughput.py
3 changes: 3 additions & 0 deletions .gitmodules
@@ -79,3 +79,6 @@
[submodule "intel_extension_for_transformers/llm/runtime/third_party/xbyak"]
path = intel_extension_for_transformers/llm/runtime/deprecated/third_party/xbyak
url = https://github.com/herumi/xbyak.git
[submodule "intel_extension_for_transformers/llm/runtime/graph/application/third_party/pybind11"]
path = intel_extension_for_transformers/llm/runtime/graph/application/third_party/pybind11
url = https://github.com/pybind/pybind11.git
2 changes: 1 addition & 1 deletion intel_extension_for_transformers/llm/runtime/__init__.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -82,6 +82,7 @@ def __call__(self, model):


head_num_idx = False
head_num = 0
if pack_node.input_tensors[1].data != None:
head_num_idx = 1
head_num = int(pack_node.input_tensors[1].data)
104 changes: 57 additions & 47 deletions intel_extension_for_transformers/llm/runtime/graph/README.md
@@ -52,29 +52,67 @@ cmake ..
cmake --build . -j
```

### 2. Convert LLM
### 2. Run LLM with Python API

You can use the Python API to simply run an HF model.
```python
from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
model_name = "EleutherAI/gpt-j-6b" # support model id of HF or local PATH to model
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_llm_runtime=True)
prompt = "Once upon a time, a little girl"
output = model.generate(prompt, max_new_tokens=30)
```
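If you already have a quantized bin file (see the sections below), the lower-level `Model` wrapper added in `llm/runtime/graph/__init__.py` can load it directly. A minimal sketch, assuming a previously produced `ne-q4_j.bin` for a llama-family model:

```python
from intel_extension_for_transformers.llm.runtime.graph import Model

model = Model()
# load an existing quantized bin for a supported architecture ("llama" here)
model.init_from_bin("llama", "ne-q4_j.bin")
output = model.generate("Once upon a time, a little girl")
print(output)
```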

### 3. Run LLM with Script
You can use the following script to run everything in one step, including conversion, quantization, and inference.
```bash
python scripts/run_llm.py model-path --weight_dtype int4 -p "She opened the door and see"
```

LLM one-click running script args explanations:
| arg | explanation |
| -------------- | ----------------------------------------------------------------------- |
| model | directory containing model file or model id |
| --weight_dtype | data type of quantized weight (default: int4) |
| --alg | quantization algorithm to use: sym/asym (default: sym) |
| --block_size | block size (default: 32) |
| --scale_dtype | fp32/bf16 type for scales (default: fp32) |
| --compute_type | Gemm computation data type: int8/fp32/ggml (default: ggml) |
| -p / --prompt | prompt to start generation with (default: empty) |
| -n / --n_predict | number of tokens to predict (default: -1, -1 = infinity) |
| -t / --threads | number of threads to use during computation (default: 56) |
| -b / --batch_size | batch size for prompt processing (default: 512) |
| -c / --ctx_size | size of the prompt context (default: 512, cannot be larger than the specific model's context window length) |
| -s / --seed | RNG seed (default: -1, use random seed for < 0) |
| --repeat_penalty | penalize repeat sequence of tokens (default: 1.1, 1.0 = disabled) |
| --color | colorise output to distinguish prompt and user input from generations |
| --keep | number of tokens to keep from the initial prompt (default: 0, -1 = all) |


## Advanced use

### 1. Convert and Quantize LLM model
LLM Runtime assumes the same model format as [llama.cpp](https://github.com/ggerganov/llama.cpp) and [ggml](https://github.com/ggerganov/ggml). You can also convert a model by following the steps below:

```bash
# download fp32 model (e.g., LLAMA2) from Hugging Face
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf

# convert the pytorch model to ggml format
python scripts/convert_model.py --outtype f32 --outfile ne-f32.bin model_path
# convert the model directly using its Hugging Face model id (recommended)
python scripts/convert.py --outtype f32 --outfile ne-f32.bin EleutherAI/gpt-j-6b

# or convert the model without downloading it by hand (llama and llama2 are WIP)
python scripts/convert_model.py --outtype f32 --outfile EleutherAI/gpt-j-6b
# or you can first download the fp32 model (e.g., LLAMA2) from Hugging Face, then convert the PyTorch model to ggml format.
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
python scripts/convert.py --outtype f32 --outfile ne-f32.bin model_path

# quantize weights of fp32 ggml bin
# model_name: llama, llama2, mpt, falcon, gptj, starcoder, dolly
# to the neural engine graph optimized q4_j format with 128 block_size (recommended)
python scripts/quant_bin.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 128 --compute_type int8

# to ggml q4_0 format
python scripts/quant_bin.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4
# to the neural engine graph optimized q4_j format with 128 block_size
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 128 --compute_type int8

python scripts/quant_bin.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 32 --compute_type int8
# alternatively, you can quantize to the ggml q4_0 format as follows
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4
# or use the neural engine graph optimized q4_j format with 32 block_size
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 32 --compute_type int8

```
quantization args explanations:
@@ -90,22 +128,20 @@ quantization args explanations:
| --scale_dtype | fp32/bf16 type for scales (default: fp32) |
| --compute_type | Gemm computation data type: int8/fp32/ggml (default: ggml) |
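
The same quantization step can also be driven programmatically through the `Model` wrapper added in this PR. A minimal sketch; the keyword argument names below mirror the script flags above and are assumptions about the underlying `quant_model` binding:

```python
from intel_extension_for_transformers.llm.runtime.graph import Model

model = Model()
# quantize an existing fp32 ne/ggml bin; kwarg names are assumed to mirror
# the script flags above and may differ in the actual pybind signature
model.quant_model("llama2", "ne-f32.bin", "ne-q4_j.bin",
                  weight_dtype="int4", block_size=32, compute_type="int8")
```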

### 2. Run inference with the C++ script API

### 3. Run Models

We supply a Python script for running supported models conveniently.

We supply a script for running supported models conveniently via the C++ API.
```bash
# recommended: use numactl to bind cores on Intel CPUs for better performance
# if you use a different number of cores, please also change the -t arg value
# please use a code-related prompt when running `StarCoder`, for example, -p "def fibonacci(".
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/run_llm.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see"
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see"

# if you want deterministic (fixed) outputs, please set the --seed arg, for example:
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/run_llm.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --seed 12
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --seed 12

# if you want to reduce repetition in generated text, please set --repeat_penalty (value > 1.0, default = 1.0), for example:
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/run_llm.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --repeat_penalty 1.2
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --repeat_penalty 1.2
```

LLM running script args explanations:
@@ -125,33 +161,7 @@ LLM running script args explanations:
| --glm_tokenizer | the path of the chatglm tokenizer (default: THUDM/chatglm-6b) |


### 4. One-click Script

You can use the following script to run everything in one step, including conversion, quantization, and inference.
```bash
python scripts/one_click_run.py model-path --weight_dtype int4 -p "She opened the door and see"
```

LLM one-click running script args explanations:
| arg | explanation |
| -------------- | ----------------------------------------------------------------------- |
| model | directory containing model file or model id |
| --weight_dtype | data type of quantized weight (default: int4) |
| --alg | quantization algorithm to use: sym/asym (default: sym) |
| --block_size | block size (default: 32) |
| --scale_dtype | fp32/bf16 type for scales (default: fp32) |
| --compute_type | Gemm computation data type: int8/fp32/ggml (default: ggml) |
| -p / --prompt | prompt to start generation with (default: empty) |
| -n / --n_predict | number of tokens to predict (default: -1, -1 = infinity) |
| -t / --threads | number of threads to use during computation (default: 56) |
| -b / --batch_size | batch size for prompt processing (default: 512) |
| -c / --ctx_size | size of the prompt context (default: 512, cannot be larger than the specific model's context window length) |
| -s / --seed | RNG seed (default: -1, use random seed for < 0) |
| --repeat_penalty | penalize repeat sequence of tokens (default: 1.1, 1.0 = disabled) |
| --color | colorise output to distinguish prompt and user input from generations |
| --keep | number of tokens to keep from the initial prompt (default: 0, -1 = all) |

### 5. Tensor Parallelism across nodes/sockets
### 3. Tensor Parallelism across nodes/sockets

We support a tensor parallelism strategy for distributed inference/training across multiple nodes and sockets. You can refer to [tensor_parallelism.md](./tensor_parallelism.md) to enable this feature.

96 changes: 96 additions & 0 deletions intel_extension_for_transformers/llm/runtime/graph/__init__.py
@@ -0,0 +1,96 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoConfig
from intel_extension_for_transformers.llm.runtime.graph.scripts.convert_model import convert_model

model_maps = {"gpt_neox": "gptneox", "RefinedWebModel": "falcon"}

class Model:
    def __init__(self):
        self.module = None
        self.model = None
        self.model_type = None
        self.bin_file = None

    def __import_package(self, model_name):
        if self.module:
            return
        if model_name == "gptj":
            import intel_extension_for_transformers.llm.runtime.graph.gptj_cpp as cpp_model
        elif model_name == "falcon":
            import intel_extension_for_transformers.llm.runtime.graph.falcon_cpp as cpp_model
        elif model_name == "gptneox":
            import intel_extension_for_transformers.llm.runtime.graph.gptneox_cpp as cpp_model
        elif model_name == "dolly":
            import intel_extension_for_transformers.llm.runtime.graph.dolly_cpp as cpp_model
        elif model_name == "llama" or model_name == "llama2":
            import intel_extension_for_transformers.llm.runtime.graph.llama_cpp as cpp_model
        elif model_name == "mpt":
            import intel_extension_for_transformers.llm.runtime.graph.mpt_cpp as cpp_model
        elif model_name == "starcoder":
            import intel_extension_for_transformers.llm.runtime.graph.starcoder_cpp as cpp_model
        elif model_name == "opt":
            import intel_extension_for_transformers.llm.runtime.graph.opt_cpp as cpp_model
        elif model_name == "bloom":
            import intel_extension_for_transformers.llm.runtime.graph.bloom_cpp as cpp_model
        elif model_name == "chatglm2":
            import intel_extension_for_transformers.llm.runtime.graph.chatglm2_cpp as cpp_model
        else:
            raise TypeError("Unsupported model type {}!".format(model_name))
        self.module = cpp_model

    def init(self, model_name, **kwargs):
        config = AutoConfig.from_pretrained(model_name)
        model_type = model_maps.get(config.model_type, config.model_type)
        self.__import_package(model_type)

        # 1. convert model
        fp32_bin = "ne_{}_f32.bin".format(model_type)
        convert_model(model_name, fp32_bin, "f32")

        # 2. quant model
        quant_bin = "ne_{}_q.bin".format(model_type)
        self.module.Model.quant_model(model_path = fp32_bin, out_path = quant_bin, **kwargs)

        self.model_type = model_type
        self.bin_file = quant_bin

        # clean
        os.remove(fp32_bin)


    def init_from_bin(self, model_name, model_path, **kwargs):
        self.__import_package(model_name)
        self.model = self.module.Model()
        self.model.init_model(model_path, **kwargs)

    def quant_model(self, model_name, model_path, out_path, **kwargs):
        self.__import_package(model_name)
        self.module.Model.quant_model(model_path = model_path,
                                      out_path = out_path, **kwargs)

    def generate(self, prompt, streamer = None, sentence_mode = True, **kwargs):
        # TODO support streamer
        if self.model is None:
            self.init_from_bin(self.model_type, self.bin_file, **kwargs)

        out = self.model.generate(prompt = prompt, sentence_mode = sentence_mode)
        return out

    def is_token_end(self):
        return self.model.is_token_end()
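
For reference, a minimal end-to-end usage sketch of this wrapper, assuming a supported Hugging Face model id and that `quant_model` applies sensible defaults when no quantization kwargs are passed:

```python
from intel_extension_for_transformers.llm.runtime.graph import Model

model = Model()
# init() converts the HF model to an fp32 ne bin, quantizes it, and removes
# the intermediate fp32 file; extra kwargs are forwarded to quant_model
model.init("EleutherAI/gpt-j-6b")
# the first generate() call lazily loads the quantized bin via init_from_bin
print(model.generate("Once upon a time, a little girl"))
```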
@@ -66,6 +66,21 @@ compile_quant(quant_chatglm quant_model.cpp chatglm chatglm)
compile_quant(quant_chatglm2 quant_model.cpp chatglm2 chatglm2)

# all models running
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_subdirectory(third_party/pybind11)

set(mymap_gptj 1)
set(mymap_falcon 2)
set(mymap_gptneox 3)
set(mymap_dolly 4)
set(mymap_llama 5)
set(mymap_mpt 6)
set(mymap_starcoder 7)
set(mymap_opt 8)
set(mymap_bloom 9)
set(mymap_chatglm2 10)
set(mymap_chatglm 11)

function(compile_run TARGET SRC MODEL_NAME MODEL_LIB)
add_executable_w_warning(${TARGET} ${SRC})
warning_check(${TARGET})
@@ -75,8 +90,16 @@ function(compile_run TARGET SRC MODEL_NAME MODEL_LIB)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


pybind11_add_module("${MODEL_NAME}_cpp" main_pybind.cpp)
target_link_libraries("${MODEL_NAME}_cpp" PRIVATE ne_layers ${MODEL_LIB} common)
target_compile_definitions("${MODEL_NAME}_cpp" PUBLIC -DMODEL_NAME="${MODEL_NAME}" -DMODEL_NAME_ID=${mymap_${MODEL_NAME}})
endfunction()




compile_run(run_gptj main_run.cpp gptj gptj)
compile_run(run_falcon main_run.cpp falcon falcon)
compile_run(run_gptneox main_run.cpp gptneox gptneox)
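
Each `compile_run` call now also builds a per-model pybind11 extension named `<model>_cpp`, which the Python wrapper above selects by model type. A minimal sketch of importing one of these extensions directly, with an illustrative bin path:

```python
# import one of the per-model pybind11 extensions produced by compile_run
import intel_extension_for_transformers.llm.runtime.graph.gptj_cpp as gptj_cpp

m = gptj_cpp.Model()
m.init_model("ne_gptj_q.bin")  # path to a quantized gptj bin (illustrative)
print(m.generate(prompt="Once upon a time", sentence_mode=True))
```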
@@ -723,6 +723,8 @@ bool quant_params_parse(int argc, char** argv, quant_params& params) {
quant_print_usage(argc, argv, params);
exit(0);
} else {
quant_print_usage(argc, argv, params);
fprintf(stderr, "unrecognized arguments: %s", arg.c_str());
exit(0);
}
}
Expand Down
