# Prompt template

This document shows examples of how to use prompt templates correctly in Neural Speed and [ITREX](https://github.com/intel/intel-extension-for-transformers).

For a base model (pre-training only, without SFT), the prompt can be encoded directly into token ids without adding any special prefix or suffix tokens. A chat model, however, needs a prompt template to generate correct and human-readable responses, because such models are usually trained with specific prompt templates.
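
Many chat models on the Hugging Face Hub also ship their prompt template with the tokenizer, so recent `transformers` releases can render it for you via `tokenizer.apply_chat_template`. A minimal sketch, assuming the model's tokenizer defines a `chat_template` (e.g. `meta-llama/Llama-2-7b-chat-hf` ships one):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
messages = [{"role": "user", "content": "What is a prompt template?"}]
# Render the model-specific template as text; add_generation_prompt appends the
# assistant prefix for templates that define one (set tokenize=True for token ids).
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```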

## Chat with ChatGLM3:
```python
from transformers import AutoTokenizer
from neural_speed import Model

model_path = "THUDM/chatglm3-6b"  # Hugging Face id or local path to the model
gguf_path = "chatglm3-ggml-q4.bin"  # placeholder: path to your converted / quantized Neural Speed model file

prompt = "你好"  # "Hello"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# build_chat_input applies ChatGLM3's chat template and returns token ids directly.
inputs = tokenizer.build_chat_input(prompt)['input_ids']
model = Model()
model.init_from_bin("chatglm3", gguf_path)  # model architecture name + converted model file
outputs = model.generate(inputs, max_new_tokens=300, do_sample=True)
words = tokenizer.decode(outputs[0])
```

## Chat with LLaMA2:

```python
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

# Please change to a local model path; llama2 currently does not support online conversion.
model_name = "meta-llama/Llama-2-7b-chat-hf"
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)

while True:
    prompt = input("> ").strip()
    if prompt == "quit":
        break
    b_prompt = "[INST]{}[/INST]".format(prompt)  # prompt template for llama2
    inputs = tokenizer(b_prompt, return_tensors="pt").input_ids
    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True, do_sample=True)
```
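
The Llama-2 chat format also accepts a system prompt, wrapped in `<<SYS>>` markers inside the first `[INST]` block. A minimal sketch; the helper name `build_llama2_prompt` is only for illustration:

```python
# Llama-2 chat format with an optional system prompt; the system text is
# wrapped in <<SYS>> markers inside the first [INST] block.
def build_llama2_prompt(user_msg: str, system_msg: str = "") -> str:
    if system_msg:
        return "[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]".format(system_msg, user_msg)
    return "[INST] {} [/INST]".format(user_msg)

b_prompt = build_llama2_prompt("What is a prompt template?", "You are a helpful assistant.")
```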

## Chat with ChatGLM2:
```python
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

model_name = "THUDM/chatglm2-6b"  # or local path to model
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)

while True:
    prompt = input("> ").strip()
    if prompt == "quit":
        break
    prompt = tokenizer.build_prompt(prompt)  # prompt template for chatglm2
    inputs = tokenizer([prompt], return_tensors="pt").input_ids
    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True, do_sample=True, n_keep=2)
```
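
For reference, `tokenizer.build_prompt` expands a single-turn query into ChatGLM2's round-based template, roughly as sketched below (based on the `THUDM/chatglm2-6b` tokenizer; the helper name is illustrative, so verify against your local tokenizer):

```python
# A sketch of ChatGLM2's single-turn template as produced by tokenizer.build_prompt.
def chatglm2_prompt(query: str, round_idx: int = 1) -> str:
    return "[Round {}]\n\n问：{}\n\n答：".format(round_idx, query)

print(chatglm2_prompt("你好"))  # [Round 1]\n\n问：你好\n\n答：
```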

## Chat with Qwen:
```python
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

model_name = "Qwen/Qwen-7B-Chat"  # or local path to model
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
streamer = TextStreamer(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)

while True:
    prompt = input("> ").strip()
    if prompt == "quit":
        break
    prompt = "\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n".format(prompt)  # prompt template for qwen
    inputs = tokenizer([prompt], return_tensors="pt").input_ids
    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True, do_sample=True)
```
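
Qwen-Chat uses the ChatML format, so a system turn can be prepended in the same style. A minimal sketch; the helper name `build_qwen_prompt` and the default system message are illustrative:

```python
# ChatML-style prompt for Qwen-Chat with an optional system turn.
def build_qwen_prompt(user_msg: str, system_msg: str = "You are a helpful assistant.") -> str:
    return ("<|im_start|>system\n{}<|im_end|>\n"
            "<|im_start|>user\n{}<|im_end|>\n"
            "<|im_start|>assistant\n").format(system_msg, user_msg)

prompt = build_qwen_prompt("What is a prompt template?")
```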