Python api for cpp model (#252)
zhenwei-intel committed Sep 20, 2023
1 parent 2ab0e3b commit be651be
Showing 20 changed files with 624 additions and 80 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker/codeScan.dockerfile
@@ -33,7 +33,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \

RUN ln -sf $(which python3) /usr/bin/python

RUN python -m pip install --no-cache-dir pylint==2.12.1\
RUN python -m pip install --no-cache-dir pylint==2.17.5\
bandit==1.7.4\
pyspelling\
pydocstyle
1 change: 1 addition & 0 deletions .github/workflows/script/formatScan/pylint.sh
@@ -40,6 +40,7 @@ python -m pylint -f json --disable=R,C,W,E1129 \
--extension-pkg-whitelist=numpy,nltk \
--ignored-classes=TensorProto,NodeProto \
--ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py \
--ignore-paths=/intel-extension-for-transformers/intel_extension_for_transformers/llm/runtime/graph/ \
/intel-extension-for-transformers/intel_extension_for_transformers >${log_dir}/pylint.json
exit_code=$?

@@ -10,5 +10,6 @@ omit =
*/intel_extension_for_transformers/llm/finetuning/**
*/intel_extension_for_transformers/llm/inference/**
*/intel_extension_for_transformers/llm/quantization/**
*/intel_extension_for_transformers/llm/runtime/graph/**
exclude_lines =
pragma: no cover
@@ -11,6 +11,7 @@ omit =
*/intel_extension_for_transformers/llm/library/**
*/intel_extension_for_transformers/llm/operator/**
*/intel_extension_for_transformers/llm/runtime/deprecated/**
*/intel_extension_for_transformers/llm/runtime/graph/**
*/intel_extension_for_transformers/neural_chat/**
*/intel_extension_for_transformers/transformers/modeling/**
*/intel_extension_for_transformers/transformers/utils/get_throughput.py
3 changes: 3 additions & 0 deletions .gitmodules
@@ -79,3 +79,6 @@
[submodule "intel_extension_for_transformers/llm/runtime/third_party/xbyak"]
path = intel_extension_for_transformers/llm/runtime/deprecated/third_party/xbyak
url = https://github.com/herumi/xbyak.git
[submodule "intel_extension_for_transformers/llm/runtime/graph/application/third_party/pybind11"]
path = intel_extension_for_transformers/llm/runtime/graph/application/third_party/pybind11
url = https://github.com/pybind/pybind11.git
2 changes: 1 addition & 1 deletion intel_extension_for_transformers/llm/runtime/__init__.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -82,6 +82,7 @@ def __call__(self, model):


head_num_idx = False
head_num = 0
if pack_node.input_tensors[1].data != None:
head_num_idx = 1
head_num = int(pack_node.input_tensors[1].data)
104 changes: 57 additions & 47 deletions intel_extension_for_transformers/llm/runtime/graph/README.md
@@ -52,29 +52,67 @@ cmake ..
cmake --build . -j
```

### 2. Convert LLM
### 2. Run LLM with Python API

You can use the Python API to simply run an HF model.
```python
from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig
model_name = "EleutherAI/gpt-j-6b" # support model id of HF or local PATH to model
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_llm_runtime=True)
prompt = "Once upon a time, a little girl"
output = model.generate(prompt, max_new_tokens=30)
```
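If you already have a quantized bin file (see the sections below), the lower-level `Model` wrapper added in `llm/runtime/graph/__init__.py` can load it directly. A minimal sketch, assuming a previously produced `ne-q4_j.bin` for a llama-family model:

```python
from intel_extension_for_transformers.llm.runtime.graph import Model

model = Model()
# load an existing quantized bin for a supported architecture ("llama" here)
model.init_from_bin("llama", "ne-q4_j.bin")
output = model.generate("Once upon a time, a little girl")
print(output)
```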

### 3. Run LLM with Script
You can use the following script to run everything in one step, including conversion, quantization, and inference.
```bash
python scripts/run_llm.py model-path --weight_dtype int4 -p "She opened the door and see"
```

LLM one-click running script args explanations:
| arg | explanation |
| -------------- | ----------------------------------------------------------------------- |
| model | directory containing model file or model id |
| --weight_dtype | data type of quantized weight (default: int4) |
| --alg | quantization algorithm to use: sym/asym (default: sym) |
| --block_size | block size (default: 32) |
| --scale_dtype | fp32/bf16 type for scales (default: fp32) |
| --compute_type | Gemm computation data type: int8/fp32/ggml (default: ggml) |
| -p / --prompt | prompt to start generation with (default: empty) |
| -n / --n_predict | number of tokens to predict (default: -1, -1 = infinity) |
| -t / --threads | number of threads to use during computation (default: 56) |
| -b / --batch_size | batch size for prompt processing (default: 512) |
| -c / --ctx_size | size of the prompt context (default: 512, cannot be larger than the specific model's context window length) |
| -s / --seed | RNG seed (default: -1, use random seed for < 0) |
| --repeat_penalty | penalize repeat sequence of tokens (default: 1.1, 1.0 = disabled) |
| --color | colorise output to distinguish prompt and user input from generations |
| --keep | number of tokens to keep from the initial prompt (default: 0, -1 = all) |


## Advanced use

### 1. Convert and Quantize LLM model
LLM Runtime assumes the same model format as [llama.cpp](https://github.com/ggerganov/llama.cpp) and [ggml](https://github.com/ggerganov/ggml). You can also convert a model by following the steps below:

```bash
# download fp32 model (e.g., LLAMA2) from Hugging Face
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf

# convert the pytorch model to ggml format
python scripts/convert_model.py --outtype f32 --outfile ne-f32.bin model_path
# convert the model directly using its Hugging Face model id (recommended)
python scripts/convert.py --outtype f32 --outfile ne-f32.bin EleutherAI/gpt-j-6b

# or convert the model without downloading it by hand (llama and llama2 are WIP)
python scripts/convert_model.py --outtype f32 --outfile EleutherAI/gpt-j-6b
# or you can first download the fp32 model (e.g., LLAMA2) from Hugging Face, then convert the PyTorch model to ggml format.
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
python scripts/convert.py --outtype f32 --outfile ne-f32.bin model_path

# quantize weights of fp32 ggml bin
# model_name: llama, llama2, mpt, falcon, gptj, starcoder, dolly
# to the neural engine graph optimized q4_j format with 128 block_size (recommended)
python scripts/quant_bin.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 128 --compute_type int8

# to ggml q4_0 format
python scripts/quant_bin.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4
# to the neural engine graph optimized q4_j format with 128 block_size
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 128 --compute_type int8

python scripts/quant_bin.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 32 --compute_type int8
# alternatively, you can quantize to the ggml q4_0 format as follows
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4
# or use the neural engine graph optimized q4_j format with 32 block_size
python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 32 --compute_type int8

```
quantization args explanations:
@@ -90,22 +128,20 @@ quantization args explanations:
| --scale_dtype | fp32/bf16 type for scales (default: fp32) |
| --compute_type | Gemm computation data type: int8/fp32/ggml (default: ggml) |
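
The same quantization step can also be driven programmatically through the `Model` wrapper added in this PR. A minimal sketch; the keyword argument names below mirror the script flags above and are assumptions about the underlying `quant_model` binding:

```python
from intel_extension_for_transformers.llm.runtime.graph import Model

model = Model()
# quantize an existing fp32 ne/ggml bin; kwarg names are assumed to mirror
# the script flags above and may differ in the actual pybind signature
model.quant_model("llama2", "ne-f32.bin", "ne-q4_j.bin",
                  weight_dtype="int4", block_size=32, compute_type="int8")
```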

### 2. Run inference with the C++ script API

### 3. Run Models

We supply a Python script for running supported models conveniently.

We supply a script for running supported models conveniently via the C++ API.
```bash
# recommended: use numactl to bind cores on Intel CPUs for better performance
# if you use a different number of cores, please also change the -t arg value
# please use a code-related prompt when running `StarCoder`, for example, -p "def fibonacci(".
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/run_llm.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see"
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see"

# if you want deterministic (fixed) outputs, please set the --seed arg, for example:
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/run_llm.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --seed 12
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --seed 12

# if you want to reduce repetition in generated text, please set --repeat_penalty (value > 1.0, default = 1.0), for example:
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/run_llm.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --repeat_penalty 1.2
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python scripts/inference.py --model_name llama -m ne-q4_j.bin -c 512 -b 1024 -n 256 -t 56 --color -p "She opened the door and see" --repeat_penalty 1.2
```

LLM running script args explanations:
@@ -125,33 +161,7 @@ LLM running script args explanations:
| --glm_tokenizer | the path of the chatglm tokenizer (default: THUDM/chatglm-6b) |


### 4. One-click Script

You can use the following script to run everything in one step, including conversion, quantization, and inference.
```bash
python scripts/one_click_run.py model-path --weight_dtype int4 -p "She opened the door and see"
```

LLM one-click running script args explanations:
| arg | explanation |
| -------------- | ----------------------------------------------------------------------- |
| model | directory containing model file or model id |
| --weight_dtype | data type of quantized weight (default: int4) |
| --alg | quantization algorithm to use: sym/asym (default: sym) |
| --block_size | block size (default: 32) |
| --scale_dtype | fp32/bf16 type for scales (default: fp32) |
| --compute_type | Gemm computation data type: int8/fp32/ggml (default: ggml) |
| -p / --prompt | prompt to start generation with (default: empty) |
| -n / --n_predict | number of tokens to predict (default: -1, -1 = infinity) |
| -t / --threads | number of threads to use during computation (default: 56) |
| -b / --batch_size | batch size for prompt processing (default: 512) |
| -c / --ctx_size | size of the prompt context (default: 512, cannot be larger than the specific model's context window length) |
| -s / --seed | RNG seed (default: -1, use random seed for < 0) |
| --repeat_penalty | penalize repeat sequence of tokens (default: 1.1, 1.0 = disabled) |
| --color | colorise output to distinguish prompt and user input from generations |
| --keep | number of tokens to keep from the initial prompt (default: 0, -1 = all) |

### 5. Tensor Parallelism across nodes/sockets
### 3. Tensor Parallelism across nodes/sockets

We support a tensor parallelism strategy for distributed inference/training across multiple nodes and sockets. You can refer to [tensor_parallelism.md](./tensor_parallelism.md) to enable this feature.

96 changes: 96 additions & 0 deletions intel_extension_for_transformers/llm/runtime/graph/__init__.py
@@ -0,0 +1,96 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from transformers import AutoConfig
from intel_extension_for_transformers.llm.runtime.graph.scripts.convert_model import convert_model

model_maps = {"gpt_neox": "gptneox", "RefinedWebModel": "falcon"}

class Model:
    def __init__(self):
        self.module = None
        self.model = None
        self.model_type = None
        self.bin_file = None

    def __import_package(self, model_name):
        if self.module:
            return
        if model_name == "gptj":
            import intel_extension_for_transformers.llm.runtime.graph.gptj_cpp as cpp_model
        elif model_name == "falcon":
            import intel_extension_for_transformers.llm.runtime.graph.falcon_cpp as cpp_model
        elif model_name == "gptneox":
            import intel_extension_for_transformers.llm.runtime.graph.gptneox_cpp as cpp_model
        elif model_name == "dolly":
            import intel_extension_for_transformers.llm.runtime.graph.dolly_cpp as cpp_model
        elif model_name == "llama" or model_name == "llama2":
            import intel_extension_for_transformers.llm.runtime.graph.llama_cpp as cpp_model
        elif model_name == "mpt":
            import intel_extension_for_transformers.llm.runtime.graph.mpt_cpp as cpp_model
        elif model_name == "starcoder":
            import intel_extension_for_transformers.llm.runtime.graph.starcoder_cpp as cpp_model
        elif model_name == "opt":
            import intel_extension_for_transformers.llm.runtime.graph.opt_cpp as cpp_model
        elif model_name == "bloom":
            import intel_extension_for_transformers.llm.runtime.graph.bloom_cpp as cpp_model
        elif model_name == "chatglm2":
            import intel_extension_for_transformers.llm.runtime.graph.chatglm2_cpp as cpp_model
        else:
            raise TypeError("Unsupported model type {}!".format(model_name))
        self.module = cpp_model

    def init(self, model_name, **kwargs):
        config = AutoConfig.from_pretrained(model_name)
        model_type = model_maps.get(config.model_type, config.model_type)
        self.__import_package(model_type)

        # 1. convert model
        fp32_bin = "ne_{}_f32.bin".format(model_type)
        convert_model(model_name, fp32_bin, "f32")

        # 2. quant model
        quant_bin = "ne_{}_q.bin".format(model_type)
        self.module.Model.quant_model(model_path = fp32_bin, out_path = quant_bin, **kwargs)

        self.model_type = model_type
        self.bin_file = quant_bin

        # clean
        os.remove(fp32_bin)


    def init_from_bin(self, model_name, model_path, **kwargs):
        self.__import_package(model_name)
        self.model = self.module.Model()
        self.model.init_model(model_path, **kwargs)

    def quant_model(self, model_name, model_path, out_path, **kwargs):
        self.__import_package(model_name)
        self.module.Model.quant_model(model_path = model_path,
                                      out_path = out_path, **kwargs)

    def generate(self, prompt, streamer = None, sentence_mode = True, **kwargs):
        # TODO support streamer
        if self.model is None:
            self.init_from_bin(self.model_type, self.bin_file, **kwargs)

        out = self.model.generate(prompt = prompt, sentence_mode = sentence_mode)
        return out

    def is_token_end(self):
        return self.model.is_token_end()
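
For reference, a minimal end-to-end usage sketch of this wrapper, assuming a supported Hugging Face model id and that `quant_model` applies sensible defaults when no quantization kwargs are passed:

```python
from intel_extension_for_transformers.llm.runtime.graph import Model

model = Model()
# init() converts the HF model to an fp32 ne bin, quantizes it, and removes
# the intermediate fp32 file; extra kwargs are forwarded to quant_model
model.init("EleutherAI/gpt-j-6b")
# the first generate() call lazily loads the quantized bin via init_from_bin
print(model.generate("Once upon a time, a little girl"))
```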
@@ -66,6 +66,21 @@ compile_quant(quant_chatglm quant_model.cpp chatglm chatglm)
compile_quant(quant_chatglm2 quant_model.cpp chatglm2 chatglm2)

# all models running
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_subdirectory(third_party/pybind11)

set(mymap_gptj 1)
set(mymap_falcon 2)
set(mymap_gptneox 3)
set(mymap_dolly 4)
set(mymap_llama 5)
set(mymap_mpt 6)
set(mymap_starcoder 7)
set(mymap_opt 8)
set(mymap_bloom 9)
set(mymap_chatglm2 10)
set(mymap_chatglm 11)

function(compile_run TARGET SRC MODEL_NAME MODEL_LIB)
add_executable_w_warning(${TARGET} ${SRC})
warning_check(${TARGET})
@@ -75,8 +90,16 @@ function(compile_run TARGET SRC MODEL_NAME MODEL_LIB)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


pybind11_add_module("${MODEL_NAME}_cpp" main_pybind.cpp)
target_link_libraries("${MODEL_NAME}_cpp" PRIVATE ne_layers ${MODEL_LIB} common)
target_compile_definitions("${MODEL_NAME}_cpp" PUBLIC -DMODEL_NAME="${MODEL_NAME}" -DMODEL_NAME_ID=${mymap_${MODEL_NAME}})
endfunction()




compile_run(run_gptj main_run.cpp gptj gptj)
compile_run(run_falcon main_run.cpp falcon falcon)
compile_run(run_gptneox main_run.cpp gptneox gptneox)
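
Each `compile_run` call now also builds a per-model pybind11 extension named `<model>_cpp`, which the Python wrapper above selects by model type. A minimal sketch of importing one of these extensions directly, with an illustrative bin path:

```python
# import one of the per-model pybind11 extensions produced by compile_run
import intel_extension_for_transformers.llm.runtime.graph.gptj_cpp as gptj_cpp

m = gptj_cpp.Model()
m.init_model("ne_gptj_q.bin")  # path to a quantized gptj bin (illustrative)
print(m.generate(prompt="Once upon a time", sentence_mode=True))
```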
@@ -723,6 +723,8 @@ bool quant_params_parse(int argc, char** argv, quant_params& params) {
quant_print_usage(argc, argv, params);
exit(0);
} else {
quant_print_usage(argc, argv, params);
fprintf(stderr, "unrecognized arguments: %s", arg.c_str());
exit(0);
}
}
Expand Down
