From e5ed8270d4d89bf68757f967676db57292c71920 Mon Sep 17 00:00:00 2001
From: jianan-gu
Date: Mon, 20 Nov 2023 18:50:32 +0800
Subject: [PATCH] LLM doc refinement (#2283)

* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update README.md
---
 examples/cpu/inference/python/llm/README.md | 207 +++++++++++---------
 1 file changed, 109 insertions(+), 98 deletions(-)

diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md
index 609ec3b54..01925a343 100644
--- a/examples/cpu/inference/python/llm/README.md
+++ b/examples/cpu/inference/python/llm/README.md
@@ -80,93 +80,94 @@ You can run LLM with a one-click Python script "run.py" for all inference cases.
```
python run.py --help # for more detailed usages
```
+| Key args of run.py | Notes |
+|---|:---:|
+| generation | default: beam search (beam size = 4); use "--greedy" for greedy search |
+| input tokens | default: 32; use "--input-tokens" to pick a fixed input prompt size from [32, 64, 128, 256, 512, 1024, 2016, 2017, 2048, 4096, 8192]; if "--input-tokens" is not used, use "--prompt" to provide a custom input string |
+| output tokens | default: 32; use "--max-new-tokens" to choose any other size |
+| batch size | default: 1; use "--batch-size" to choose any other size |
+| token latency | enable "--token-latency" to print out the first-token and next-token latencies |
+| generation iterations | use "--num-iter" and "--num-warmup" to control the repeated iterations of generation; default: 100 iterations with 10 warm-up iterations |
+
## Example usages of one-click Python script
-### Single Instance Performance
+### Single Instance inference
+#### Prepare:
```bash
# Get prompt file to the path of scripts
cp prompt.json ./single_instance
export WORK_DIR=./
-
-# bf16 benchmark
+```
+#### BF16:
+```bash
+# general command:
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode
-# weight only quantization int8 benchmark
+# An example of llama2 7b model:
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --deployment-mode
+```
+#### Weight-only quantization:
+```bash
+# int8 general command:
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <MODEL_ID> --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed
-# weight only quantization int4 benchmark
+# An example of llama2 7b model:
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed
+
+
+# int4 general command:
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <MODEL_ID> --ipex-weight-only-quantization --gptq --output-dir "saved_results" --int8-bf16-mixed
+# for GPT-NEOX weight-only quantization, use "--int8" instead of "--int8-bf16-mixed" due to accuracy concerns.
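+# A hypothetical GPT-NEOX int4 example illustrating the note above (a sketch only: the model id is taken from elsewhere in this doc, and the core counts mirror the llama2 examples):
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m EleutherAI/gpt-neox-20b --ipex-weight-only-quantization --gptq --output-dir "saved_results" --int8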
-# static quantization int8 benchmark
-OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <MODEL_ID> --ipex-smooth-quant --alpha <ALPHA_VALUE> --output-dir "saved_results" --int8-bf16-mixed
+# An example of llama2 7b model:
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed --gptq
+```
+#### Static quantization (int8):
+```bash
+# general command:
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <MODEL_ID> --ipex-smooth-quant --alpha <ALPHA_VALUE> --output-dir "saved_results" --int8
# For the best alpha values (range [0, 1.0], float) tuned for specific models, we verified good accuracy: "EleutherAI/gpt-j-6b" with alpha=1.0, "meta-llama/Llama-2-7b-chat-hf" with alpha=0.8.
-# For other variant models, suggest using default alpha=0.5, and could be further tuned in the range [0, 1.0]. (suggest step_size of 0.05)
+# For more recipes, please refer to https://github.com/intel/neural-compressor/blob/master/docs/source/smooth_quant.md#validated-models
+# Note: by default, "--int8" runs int8 mixed with fp32; for peak static quantization performance, use "--int8-bf16-mixed" instead (it may impact accuracy).
+
+# An example of llama2 7b model:
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-chat-hf --ipex-smooth-quant --alpha 0.8 --output-dir "saved_results" --int8
+```
+*Notes for all quantizations:*
+
+(1) For quantization benchmarks, the first run will auto-generate the quantized model named "best_model.pt" in the "--output-dir" path; you can reuse these quantized models for inference-only benchmarks by adding "--quantized-model-path <output_dir + "best_model.pt">".
-Notes:
-(1) for quantization benchmarks, the first runs will auto-generate the quantized model named "best_model.pt" in the "--output-dir" path, you can reuse these quantized models for inference-only benchmarks by using "--quantized-model-path <output_dir + "best_model.pt">".
(2) For Falcon quantization, "--config-file <CONFIG_FILE>" is required; an example <CONFIG_FILE> is "utils/model_config/tiiuae_falcon-40b_config.json".
-(3) for GPT-NEOX quantizations, using "--int8" instead of "--int8-bf16-mixed" for accuracy concerns.
-(4) By default, generations are based on "beam search", and beam size = 4. For beam size = 1, please add "--greedy"
-```
-### Distributed Performance with DeepSpeed (autoTP)
+### Distributed inference with DeepSpeed (autoTP)
+#### Prepare:
```bash
# Get prompt file to the path of scripts
cp prompt.json ./distributed
export WORK_DIR=./
unset KMP_AFFINITY
-
-# bf16 benchmark
+# By default, we use "--shard-model" for better memory usage; if your model path is already sharded, please remove "--shard-model"
+```
+#### BF16:
+```bash
+# general command:
deepspeed --bind_cores_to_rank run.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode --autotp --shard-model
-# weight only quantization int8 benchmark
-deepspeed --bind_cores_to_rank run.py --benchmark -m <MODEL_ID> --ipex --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed --autotp --shard-model
-
-Notes:
-(1) for Falcon quantizations, "--config-file <CONFIG_FILE>" is needed and example of <CONFIG_FILE>: "utils/model_config/tiiuae_falcon-40b_config.json".
-(2) for GPT-NEOX quantizations, using "--int8" instead of "--int8-bf16-mixed", and "--dtype float32" for accuracy concerns.
-(3) by default, we use "--shard-model" for better memory usage, if your model is already sharded, please remove "--shard-model"
-(4) By default, generations are based on "beam search", and beam size = 4. For beam size = 1, please add "--greedy"
-
+# An example of llama2 7b model:
+deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --deployment-mode --autotp --shard-model
```
-
-# Advanced Usage
-## Single Instance Performance
+#### Weight-only quantization:
```bash
-# Get prompt file to the path of scripts
-export WORK_DIR=./
-cd single_instance
-cp PATH/TO/prompt.json ./
-# bfloat16 benchmark
-OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run_generation.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode
-
-# quantization benchmark
-#To run quantization performance, you need to firstly get the quantized model with the following step (1) and then run the performance benchmark with the following step (2)
-## (1) Do quantization to get the quantized model
-## note: llama/gptj we have both IPEX smooth quant and weight-only-quantization, while for rest models, we recommend weight-only-quantization
-mkdir saved_results
-
-## GPT-J quantization
-python run_gpt-j_quantization.py --ipex-smooth-quant --alpha <ALPHA_VALUE> --output-dir "saved_results" --int8-bf16-mixed -m <MODEL_ID>
-## Llama 2 quantization
-python run_llama_quantization.py --ipex-smooth-quant --alpha <ALPHA_VALUE> --output-dir "saved_results" --int8-bf16-mixed -m <MODEL_ID>
-# For the best alpha values (range [0, 1.0], float) tuned for specific models, we verified good accuracy: "EleutherAI/gpt-j-6b" with alpha=1.0, "meta-llama/Llama-2-7b-chat-hf" with alpha=0.8.
-# For other variant models, suggest using default alpha=0.5, and could be further tuned in the range [0, 1.0]. (suggest step_size of 0.05)
-
-## GPT-NEOX quantization
-python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8 -m <MODEL_ID>
-## Falcon quantization (example of config-file: utils/model_config/tiiuae_falcon-40b_config.json)
-python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <MODEL_ID> --config-file <CONFIG_FILE>
-## OPT quantization
-python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <MODEL_ID>
-## CodeGen quantization
-python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <MODEL_ID>
-
-## (2) Run quantization performance test (note that GPT-NEOX uses --int8 instead of --int8-bf16-mixed)
-OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run_<MODEL>_quantization.py -m <MODEL_ID> --quantized-model-path "./saved_results/best_model.pt" --benchmark --int8-bf16-mixed
+# int8 general command:
+deepspeed --bind_cores_to_rank run.py --benchmark -m <MODEL_ID> --ipex --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed --autotp --shard-model
+# for Falcon quantization, "--config-file <CONFIG_FILE>" is needed; an example <CONFIG_FILE> is "utils/model_config/tiiuae_falcon-40b_config.json".
+# for GPT-NEOX weight-only quantization, use "--int8" instead of "--int8-bf16-mixed" and add "--dtype float32" due to accuracy concerns.
+# An example of llama2 7b model:
+deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed --autotp --shard-model
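+# A hypothetical Falcon example illustrating the "--config-file" note above (a sketch only; the model id is assumed from the config file name):
+deepspeed --bind_cores_to_rank run.py --benchmark -m tiiuae/falcon-40b --ipex --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed --autotp --shard-model --config-file utils/model_config/tiiuae_falcon-40b_config.json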
```
-## Weight only quantization with low precision checkpoint (Experimental)
+# Advanced Usage
+## Weight-only quantization with low precision checkpoint (Experimental)
Using INT4 weights can further improve performance by reducing memory bandwidth. However, direct per-channel quantization of weights to INT4 probably results in poor accuracy.
Some algorithms can modify weights through calibration before quantizing weights to minimize the accuracy drop. GPTQ is one such algorithm. You may generate modified weights and quantization info (scales, zero points) for a certain model with some dataset using such algorithms. The results are saved as a `state_dict` in a `.pt` file. We provide a script here to run GPTQ (Intel(R) Neural Compressor 2.3.1 is required). Here is how to use it:
@@ -228,55 +229,65 @@ Please note that 100 GB disk space, 100 GB memory and Internet access are needed
IPEX now only supports certain cases. Weights must be N by K and per-channel asymmetrically quantized (group size = -1) to UINT4 and then compressed along the K axis to `torch.int32`. The data type of scales can be any floating point type. The shape of scales should be [N], or [N] with additional dimensions of length 1, e.g., [N, 1] or [1, N]. Zero points should have the same shape as scales and be stored as `torch.int32`, but the true data type is UINT4. Bias is optional in the `state_dict` (checkpoint): if it is present, we read bias from the `state_dict`; otherwise we read bias from the original model. Bias is `None` if it cannot be found in either case.
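+As an illustration of that expected layout, the sketch below constructs one layer's entries of such a checkpoint. It is an assumption-laden example only: the key names ("qweight", "scales", "qzeros", "bias") and the layer prefix are hypothetical, and it assumes 8 UINT4 values are packed into each `torch.int32` along the K axis; the real keys and packing come from the GPTQ script output.
+```python
+import torch
+
+N, K = 4096, 4096  # example output/input channel sizes of one linear layer
+
+# Hypothetical entries for a single layer of the low precision checkpoint (a state_dict saved to a .pt file)
+low_precision_checkpoint = {
+    # UINT4 weights, per-channel asymmetrically quantized (group size = -1),
+    # compressed along the K axis into int32 (assuming 8 x 4-bit values per int32) -> shape [N, K // 8]
+    "model.layers.0.self_attn.q_proj.qweight": torch.randint(-2**31, 2**31 - 1, (N, K // 8), dtype=torch.int32),
+    # scales: any floating point dtype, shape [N] (or [N, 1] / [1, N])
+    "model.layers.0.self_attn.q_proj.scales": torch.rand(N, dtype=torch.float32),
+    # zero points: same shape as scales, stored as int32 although the true data type is UINT4 (values 0..15)
+    "model.layers.0.self_attn.q_proj.qzeros": torch.randint(0, 16, (N,), dtype=torch.int32),
+    # bias is optional; omit this key to fall back to the bias of the original model
+    "model.layers.0.self_attn.q_proj.bias": torch.zeros(N, dtype=torch.float32),
+}
+torch.save(low_precision_checkpoint, "gptq_checkpoint.pt")
+```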
-## Single Instance Accuracy
+## Accuracy test
+We leverage [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for the accuracy test.
+By default we test the "lambada_standard" task; for more choices, see {TASK_NAME} in this [link](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).
+### Single Instance
```bash
-Accuracy test {TASK_NAME}, choice in this [link](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md), by default we use "lambada_standard"
-
-# bfloat16
+cd ./single_instance
+```
+#### BF16:
+```bash
+# general command:
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run_accuracy.py --accuracy-only -m <MODEL_ID> --dtype bfloat16 --ipex --jit --tasks {TASK_NAME}
-# Quantization as a performance part
-# (1) Do quantization to get the quantized model as mentioned above
-# (2) Run int8 accuracy test (note that GPT-NEOX please remove --int8-bf16-mixed)
-OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run_accuracy.py --model <MODEL_ID> --quantized-model-path "./saved_results/best_model.pt" --dtype int8 --accuracy-only --jit --int8-bf16-mixed --tasks {TASK_NAME}
-```
-## Shard model for Distributed Performance
-```
-# We need to make sure the model is well shard before we test Distributed Performance with DeepSpeed (saving memory usage purpose)
-export WORK_DIR=./
-cd utils
-python create_shard_model.py -m <MODEL_ID> --save-path <SHARD_MODEL_NEW_PATH>
-# After sharding the model, using -m <SHARD_MODEL_NEW_PATH> in later tests.
+# An example of llama2 7b model:
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py --accuracy-only -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --jit --tasks lambada_openai
```
-## Distributed Performance with DeepSpeed (autoTP)
+#### Quantizations:
```bash
-unset KMP_AFFINITY
+# general command:
+# For the quantized models to be used in accuracy tests, we can reuse the model files named "best_model.pt" in the "--output-dir" path (generated during the inference performance runs).
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run_accuracy.py --model <MODEL_ID> --quantized-model-path "./saved_results/best_model.pt" --dtype int8 --accuracy-only --jit --tasks {TASK_NAME}
+# please also add "--int8-bf16-mixed" if your model was quantized with this flag
-# Get prompt file to the path of scripts
-export WORK_DIR=./
-cd distributed
-mv PATH/TO/prompt.json ./
-
-# Run GPTJ/LLAMA/OPT/Falcon/CodeGen with bfloat16 DeepSpeed
-deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode
-
-# Run GPT-NeoX with ipex weight only quantization
-deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m EleutherAI/gpt-neox-20b --dtype float32 --ipex --ipex-weight-only-quantization --deployment-mode
+# An example of llama2 7b model:
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf --quantized-model-path "./saved_results/best_model.pt" --dtype int8 --accuracy-only --jit --int8 --tasks lambada_openai
```
-
-## Distributed Accuracy with DeepSpeed (autoTP)
+### Distributed with DeepSpeed (autoTP)
+#### Prepare:
```bash
-# Run distributed accuracy with 2 ranks of one node for bfloat16 with ipex and jit
-source ${ONECCL_DIR}/build/_install/env/setvars.sh
-
-export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so:${CONDA_PREFIX}/lib/libtcmalloc.so
-export LD_LIBRARY_PATH=${ONECCL_DIR}/lib:$LD_LIBRARY_PATH
+# Run distributed accuracy with 2 ranks of one node
+cd ./distributed
unset KMP_AFFINITY
-
+```
+#### BF16:
+```bash
+# general command:
deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model <MODEL_ID> --dtype bfloat16 --ipex --jit --tasks <TASK_NAME> --accuracy-only
-# with weight only quantization
-
+# An example of llama2 7b model:
+deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --jit --tasks lambada_openai --accuracy-only
+```
+#### Weight-only quantization:
+```bash
+# general command:
deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model <MODEL_ID> --int8-bf16-mixed --ipex --jit --tasks <TASK_NAME> --accuracy-only --ipex-weight-only-quantization
+# note: for GPT-NEOX, please remove "--int8-bf16-mixed" and add "--dtype float32" due to accuracy concerns
+
+# An example of llama2 7b model:
+deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --int8-bf16-mixed --ipex --jit --tasks lambada_openai --accuracy-only --ipex-weight-only-quantization
+```
+
+## How to shard model for Distributed tests with DeepSpeed (autoTP)
+```
+# To save memory usage, we could shard the model weights under a local path before launching distributed tests with DeepSpeed
+export WORK_DIR=./
+cd utils
+# general command:
+python create_shard_model.py -m <MODEL_ID> --save-path <SHARD_MODEL_NEW_PATH>
+# After sharding the model, use "-m <SHARD_MODEL_NEW_PATH>" in later tests
+# An example of llama2 7b:
+python create_shard_model.py -m meta-llama/Llama-2-7b-hf --save-path ./local_llama2_7b
+```
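+
+After sharding, later distributed runs can point "-m" at the sharded path and drop "--shard-model" (as noted in the Prepare step of the distributed inference section). A minimal sketch that reuses the "./local_llama2_7b" path produced above, with the other flags taken from the distributed BF16 example:
+```bash
+# reuse the locally sharded weights; "--shard-model" is omitted because the path is already sharded
+deepspeed --bind_cores_to_rank run.py --benchmark -m ./local_llama2_7b --dtype bfloat16 --ipex --deployment-mode --autotp
+```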