update llama model
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
yuwenzho authored and mengniwang95 committed Nov 17, 2023
1 parent 1421368 commit 7f2063f
Showing 8 changed files with 38 additions and 16 deletions.
examples/.config/model_params_onnxrt.json: 6 changes (3 additions & 3 deletions)
@@ -759,21 +759,21 @@
"llama-7b-rtn": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/llama-7b",
"input_model": "/tf_dataset2/models/onnx/llama-2-7b",
"main_script": "main.py",
"batch_size": 1
},
"llama-7b-awq": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/llama-7b",
"input_model": "/tf_dataset2/models/onnx/llama-2-7b",
"main_script": "main.py",
"batch_size": 1
},
"llama-7b-gptq": {
"model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
"dataset_location": "",
"input_model": "/tf_dataset2/models/onnx/llama-7b",
"input_model": "/tf_dataset2/models/onnx/llama-2-7b",
"main_script": "main.py",
"batch_size": 1
},
@@ -14,10 +14,20 @@ pip install -r requirements.txt
## 2. Prepare Model

Note that this README uses meta-llama/Llama-2-7b-hf as an example; other models can be quantized in the same way. The following table lists the configurations of a few of them (a quick way to check these values yourself is sketched right after the table):

| Model | Num Hidden Layers | Num Attention Heads | Hidden Size |
| --- | --- | --- | --- |
| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 |
| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 |
| [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 |
| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |
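
These values can be read straight from each checkpoint's `config.json`; for example, with `transformers` (a quick check, assuming you have been granted access to the gated meta-llama repositories):

```python
from transformers import AutoConfig

# Read the architecture parameters shown in the table above.
# The meta-llama checkpoints are gated: this assumes approved access
# and a prior `huggingface-cli login`.
cfg = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")
print(cfg.num_hidden_layers, cfg.num_attention_heads, cfg.hidden_size)
# expected: 32 32 4096
```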

Export to ONNX model:
```bash
- python prepare_model.py --input_model="decapoda-research/llama-7b-hf" --output_model="./llama_7b"
- # or
- python prepare_model.py --input_model="decapoda-research/llama-13b-hf" --output_model="./llama_13b"
+ optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf
```
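
As a quick sanity check, the exported model can be loaded back through Optimum's ONNX Runtime wrapper. A minimal sketch (the local path matches the export command above; `optimum-cli` also copies the tokenizer files into the same folder):

```python
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

# Load the ONNX model exported by optimum-cli above.
model = ORTModelForCausalLM.from_pretrained("./Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("./Llama-2-7b-hf")

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```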

# Run
@@ -30,7 +40,7 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
--batch_size=batch_size \ # optional
--dataset NeelNanda/pile-10k \
--alpha 0.6 \ # 0.6 for llama-7b, 0.8 for llama-13b
- --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+ --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
--quant_format="QOperator" # or QDQ, optional
```
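
The `--alpha` value controls how aggressively SmoothQuant migrates activation outliers into the weights before 8-bit quantization. A toy numpy illustration of the transform (background only, not the script's actual code; shapes and values are made up):

```python
import numpy as np

# SmoothQuant: scale each input channel j by
#   s_j = max|X_j|^alpha / max|W_j|^(1 - alpha)
# so that (X / s) @ (s[:, None] * W) == X @ W, but X / s has a
# flatter range and quantizes with less error.
alpha = 0.6                                  # the --alpha flag above
X = np.random.randn(4, 8) * np.array([1.0, 50.0] + [1.0] * 6)  # channel 1 is an outlier
W = np.random.randn(8, 16)

s = np.abs(X).max(0) ** alpha / np.abs(W).max(1) ** (1 - alpha)
X_s, W_s = X / s, s[:, None] * W
assert np.allclose(X @ W, X_s @ W_s)         # mathematically equivalent
print(np.abs(X).max(0).round(1))             # before: one huge channel
print(np.abs(X_s).max(0).round(1))           # after: smoothed
```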

@@ -42,7 +52,7 @@ Accuracy:
bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
--batch_size=batch_size \ # optional
--mode=accuracy \
- --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+ --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
--tasks=lambada_openai
```
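
Here `lambada_openai` is an lm-evaluation-harness task name; it reports last-word prediction accuracy on the LAMBADA benchmark.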

@@ -51,6 +61,6 @@ Performance:
numactl -m 0 -C 0-3 bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
--mode=performance \
--batch_size=batch_size \ # optional
- --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+ --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
--intra_op_num_threads=4
```
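
Here `numactl -m 0 -C 0-3` binds memory allocation to NUMA node 0 and pins execution to CPU cores 0-3; adjust both to your machine's topology (here 4 cores, matching `--intra_op_num_threads=4`).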
@@ -72,7 +72,7 @@
'--tokenizer',
type=str,
help="pretrained model name or path of tokenizer files",
default="decapoda-research/llama-7b-hf"
default="meta-llama/Llama-2-7b-hf"
)
parser.add_argument(
'--workspace',
@@ -62,7 +62,7 @@ function run_tuning {
python main.py \
--quant_format ${quant_format-QOperator} \
--model_path ${input_model} \
- --tokenizer ${tokenizer-decapoda-research/llama-7b-hf} \
+ --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
--output_model ${output_model} \
--batch_size ${batch_size-1} \
--smooth_quant_alpha ${alpha-0.6} \
@@ -14,8 +14,20 @@ pip install -r requirements.txt
## 2. Prepare Model

Note that this README uses meta-llama/Llama-2-7b-hf as an example; other models can be used for weight-only quantization in the same way. The following table lists the configurations of a few of them:

| Model | Num Hidden Layers | Num Attention Heads | Hidden Size |
| --- | --- | --- | --- |
| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 32 | 32 | 4096 |
| [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 32 | 32 | 4096 |
| [meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 40 | 40 | 5120 |
| [meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 40 | 40 | 5120 |
| [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 80 | 64 | 8192 |
| [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 80 | 64 | 8192 |

Export to ONNX model:
```bash
- optimum-cli export onnx --model decapoda-research/llama-7b-hf --task text-generation-with-past ./llama_7b
+ optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf
```
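
This example's weight-only variants, registered in model_params_onnxrt.json above (`llama-7b-rtn`, `llama-7b-awq`, `llama-7b-gptq`), differ in how they pick the low-bit weights. As background, a toy numpy sketch of 4-bit group-wise RTN (round-to-nearest), the simplest of the three (illustrative only, not the example's actual implementation):

```python
import numpy as np

# RTN weight-only quantization: round each group of weights to a
# 4-bit asymmetric grid, then dequantize. AWQ and GPTQ refine the
# same idea with activation-aware scaling / error compensation.
def rtn_dequant(w, bits=4, group_size=32):
    g = w.reshape(-1, group_size)
    lo, hi = g.min(1, keepdims=True), g.max(1, keepdims=True)
    scale = (hi - lo) / (2**bits - 1)
    zero_point = np.round(-lo / scale)
    q = np.clip(np.round(g / scale) + zero_point, 0, 2**bits - 1)
    return ((q - zero_point) * scale).reshape(w.shape)

w = np.random.randn(4096, 32).astype(np.float32)
err = np.abs(w - rtn_dequant(w)).max()
print(f"max abs round-trip error: {err:.4f}")
```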

# Run
@@ -36,6 +48,6 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
```bash
bash run_benchmark.sh --input_model=path/to/model \ # folder path of onnx model
--batch_size=batch_size \ # optional
- --tokenizer=decapoda-research/llama-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+ --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
--tasks=lambada_openai
```
@@ -68,7 +68,7 @@
"--tokenizer",
type=str,
help="pretrained model name or path of tokenizer files",
default="decapoda-research/llama-7b-hf"
default="meta-llama/Llama-2-7b-hf"
)
parser.add_argument(
"--workspace",
@@ -39,7 +39,7 @@ function run_benchmark {
python main.py \
--model_path ${input_model} \
--batch_size=${batch_size-1} \
- --tokenizer=${tokenizer-decapoda-research/llama-7b-hf} \
+ --tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \
--tasks=${tasks-lambada_openai} \
--benchmark

@@ -58,7 +58,7 @@ function run_tuning {

python main.py \
--model_path ${input_model} \
- --tokenizer ${tokenizer-decapoda-research/llama-7b-hf} \
+ --tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
--output_model ${output_model} \
--batch_size ${batch_size-1} \
--dataset ${dataset-NeelNanda/pile-10k} \
