Enable OPT and LLAMA models (#877)
changwangss committed Apr 27, 2023
1 parent 3ade86f commit 6a96085
Showing 14 changed files with 366 additions and 60 deletions.
63 changes: 63 additions & 0 deletions examples/.config/pytorch_optimize.json
@@ -1282,4 +1282,67 @@
}
}
},
"opt_2.7b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_2.7b_clm_ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_2.7b_clm_ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"opt_6.7b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_6.7b_clm_ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_6.7b_clm_ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"llama_7b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "llama_7b_clm_ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "llama_7b_clm_ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
}
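Each new entry gives the CI driver a `tune` and a `benchmark` command plus its parameters. Assuming the driver forwards each `params` key as a `--key=value` flag (the argument parsing inside the shell scripts is collapsed further down in this diff, so the exact flag syntax is an assumption), an `opt_2.7b_clm_ipex` run would look roughly like:

```bash
# Hypothetical invocation reconstructed from the JSON params above.
# working_dir is taken from the JSON and assumed to be relative to examples/.
cd examples/huggingface/pytorch/language-modeling/quantization
bash run_tuning.sh --topology=opt_2.7b_clm_ipex --output_model=saved_results
bash run_benchmark.sh --topology=opt_2.7b_clm_ipex --mode=accuracy \
  --batch_size=112 --iters=100 --int8=false --config=saved_results
```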
@@ -17,7 +17,8 @@ conda install gperftools jemalloc==5.2.1 -c conda-forge -y
# Installation
pip install git+https://github.com/intel-innersource/frameworks.ai.nlp-toolkit.intel-nlp-toolkit.git
pip install git+https://github.com/intel/neural-compressor.git@6efe818497d5e424deac42580f9fde84f8b8723d
pip install intel_extension_for_pytorch transformers datasets accelerate
# Setup Environment Variables
export KMP_BLOCKTIME=1
@@ -30,23 +31,68 @@ export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
```

# Run
## 1. Quantization

Here is how to run the scripts:

**Causal Language Modeling (CLM)**

`evaluate_clm.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other datasets provided by lm_eval. Example commands are shown below.
### GPT-J-6b

#### Quantization
```bash
# "--sq" is used to enable smooth quant
# "--int8_bf16_mixed" is used to enable int8-bf16 mixed mode for platform that natively supports bf16
python evaluate_clm.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--dataset NeelNanda/pile-10k \
--sq \
--alpha 0.5 \
--output_dir "saved_results" \
```
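For orientation, `--sq` and `--alpha` drive SmoothQuant: before INT8 calibration, activation outliers are migrated into the weights through a per-channel scale. The real pass lives inside neural-compressor; the snippet below is only a conceptual sketch of the scaling rule (the tensor values are made up):

```python
import torch

def smooth_scale(act_absmax: torch.Tensor, w_absmax: torch.Tensor, alpha: float = 0.5) -> torch.Tensor:
    # Per input channel j: s_j = max|X_j|**alpha / max|W_j|**(1 - alpha).
    # alpha balances how much dynamic range is shifted from activations to weights.
    return act_absmax.pow(alpha) / w_absmax.pow(1.0 - alpha)

act_max = torch.tensor([30.0, 0.5, 12.0])  # made-up per-channel activation maxima
w_max = torch.tensor([0.40, 0.60, 0.20])   # made-up per-channel weight maxima
s = smooth_scale(act_max, w_max, alpha=0.5)
# Activations are divided by s and the matching weight columns multiplied by s,
# so X @ W is unchanged while activation outliers shrink before quantization.
print(s)
```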

#### Accuracy with lm_eval
```bash
# INT8 Accuracy
python evaluate_clm.py \
--model EleutherAI/gpt-j-6B \
--accuracy_only \
--batch_size 112 \
--tasks "lambada_openai" "lambada_standard"\
--int8 \
--output_dir "saved_results" # load int8 model
# To validate the IPEX BF16 model, replace "--int8" with "--ipex_bf16" and remove "--output_dir".
# To validate the FP32 model, remove both "--int8" and "--output_dir".
```
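Under the hood, the `--tasks` list is handed to the toolkit's lm-eval wrapper. The sketch below paraphrases the call that `evaluate_clm.py` makes after this commit (module path and keyword names are taken from the script's diff further down; treat it as illustrative rather than a standalone recipe):

```python
from transformers import AutoModelForCausalLM
from intel_extension_for_transformers.evaluation import evaluate

user_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torchscript=True)
results = evaluate(
    model="hf-causal",
    model_args="pretrained=EleutherAI/gpt-j-6B,tokenizer=EleutherAI/gpt-j-6B,dtype=float32",
    user_model=user_model,                        # the model object actually being scored
    batch_size=112,
    tasks=["lambada_openai", "lambada_standard"],
)
print(results["results"]["lambada_openai"]["acc"])
```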
### OPT-2.7b

#### Quantization

```bash
# "--sq" is used to enable smooth quant
# "--int8_bf16_mixed" is used to enable int8-bf16 mixed mode for platform that natively supports bf16
python evaluate_clm.py \
--model facebook/opt-2.7b \
--quantize \
--dataset NeelNanda/pile-10k \
--sq \
--alpha 0.5 \
--output_dir "saved_results" \
--int8_bf16_mixed
```

#### Accuracy with lm_eval
```bash
python evaluate_clm.py \
--model facebook/opt-2.7b \
--accuracy_only \
--batch_size 112 \
--tasks "winogrande" "copa" "piqa" "rte" "hellaswag" "openbookqa" "lambada_openai" "lambada_standard" \
--int8 \
--output_dir "saved_results" # load int8 model
# To validate the IPEX BF16 model, replace "--int8" with "--ipex_bf16" and remove "--output_dir".
# To validate the FP32 model, remove both "--int8" and "--output_dir".
```
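If you want to reuse the quantized model outside `evaluate_clm.py`, the IPEX INT8 artifact in `--output_dir` is typically a TorchScript archive. A hedged sketch follows; the `best_model.pt` file name and the single-tensor forward signature are assumptions, so check what the quantization run actually wrote into `saved_results/`:

```python
import torch
import intel_extension_for_pytorch as ipex  # registers the IPEX INT8 ops used by the traced graph

# File name is an assumption; adjust to whatever the quantization step saved.
model = torch.jit.load("saved_results/best_model.pt")
model.eval()

dummy_ids = torch.randint(0, 50000, (1, 32))  # made-up token ids, just to exercise the graph
with torch.no_grad():
    out = model(dummy_ids)
```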
To run quantization with the Transformers language-modeling example [`run_clm.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py), use the following command.
```
@@ -93,30 +139,8 @@ python run_mlm.py \
--overwrite_output_dir
```

## 2. Validated Model List

|Type|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining
|---|------------------------------------|---|---|---
@@ -4,13 +4,14 @@
import json
import fnmatch
import re
import numpy as np
import torch
from datasets import load_dataset
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
import intel_extension_for_pytorch as ipex


parser = argparse.ArgumentParser()
parser.add_argument(
"--model", nargs="?", default="EleutherAI/gpt-j-6b"
@@ -37,13 +38,17 @@
help="For accuracy measurement only.")
parser.add_argument("--save_accuracy_path", default=None,
help="Save accuracy results path.")
parser.add_argument("--pad_max_length", default=196, type=int,
parser.add_argument("--pad_max_length", default=512, type=int,
help="Pad input ids to max length.")
parser.add_argument("--calib_iters", default=512, type=int,
help="calibration iters.")
parser.add_argument("--tasks", nargs='+', default=["winogrande", "copa", "piqa", "rte", "hellaswag", \
"openbookqa", "lambada_openai", "lambada_standard", "wikitext"], type=str, \
help="tasks list for accuracy validation")

args = parser.parse_args()
calib_size = 1


class Evaluator:
def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False):
self.dataset = dataset
@@ -116,11 +121,19 @@ def evaluate(self, model):
print("Latency: ", latency)
return acc
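For context, the `Evaluator` pads every example to `pad_max` with `pad_val` before batching (its collate code is collapsed in this view). Roughly, and ignoring truncation, the padding amounts to:

```python
import torch
from torch.nn.functional import pad

def pad_input_ids(input_ids: torch.Tensor, pad_max: int = 512, pad_val: int = 1) -> torch.Tensor:
    # Right-pad a 1-D tensor of token ids so calibration batches share one shape.
    return pad(input_ids, (0, pad_max - input_ids.shape[0]), value=pad_val)

ids = torch.tensor([101, 2023, 2003, 102])
print(pad_input_ids(ids, pad_max=8).tolist())  # [101, 2023, 2003, 102, 1, 1, 1, 1]
```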

if re.search("llama", args.model):
from transformers import LlamaForCausalLM, LlamaTokenizer
user_model = LlamaForCausalLM.from_pretrained(
args.model,
torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors
)
tokenizer = LlamaTokenizer.from_pretrained(args.model)
else:
user_model = AutoModelForCausalLM.from_pretrained(
args.model,
torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors
)
tokenizer = AutoTokenizer.from_pretrained(args.model)


# to channels last
@@ -142,7 +155,7 @@ def evaluate(self, model):

def calib_func(prepared_model):
for i, calib_input in enumerate(calib_dataloader):
if i > args.calib_iters:
break
prepared_model(calib_input[0])
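`calib_func` is what the Neural Compressor tuner calls to push calibration batches through the prepared model. The actual `fit(...)` call site is collapsed in this diff; under neural-compressor 2.x it presumably looks something like the sketch below (the config fields are assumptions based on the `--sq`/`--alpha`/IPEX flags used in this example):

```python
# Hedged sketch of how calib_func is typically consumed (not shown in this hunk).
from neural_compressor import PostTrainingQuantConfig, quantization

conf = PostTrainingQuantConfig(
    backend="ipex",                                   # quantize through Intel Extension for PyTorch
    recipes={"smooth_quant": True,
             "smooth_quant_args": {"alpha": args.alpha}},
)
q_model = quantization.fit(user_model, conf, calib_func=calib_func)
q_model.save(args.output_dir)
```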

@@ -200,19 +213,23 @@ def calib_func(prepared_model):
if args.accuracy_only:
user_model.eval()
def eval_func(user_model):
from intel_extension_for_transformers.evaluation import evaluate
results = evaluate(
model="hf-causal",
model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32',
user_model=user_model,
batch_size=args.batch_size,
tasks=["lambada_openai"]
tasks=args.tasks,
)
dumped = json.dumps(results, indent=2)
if args.save_accuracy_path:
with open(args.save_accuracy_path, "w") as f:
f.write(dumped)
for task_name in args.tasks:
    if task_name == "wikitext":
        print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]))
    else:
        print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]))

with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled, dtype=amp_dtype):
eval_func(user_model)
@@ -5,5 +5,5 @@ datasets >= 1.1.3
torch >= 1.10
transformers
wandb
git+https://github.com/intel/neural-compressor.git@6efe818497d5e424deac42580f9fde84f8b8723d
git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
@@ -13,6 +13,7 @@ function init_params {
iters=100
batch_size=16
tuned_checkpoint=saved_results
tasks="lambada_openai lambada_standard"
for var in "$@"
do
case $var in
@@ -92,6 +93,18 @@ function run_benchmark {
script="evaluate_clm.py"
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
approach="PostTrainingStatic"
elif [ "${topology}" = "opt_2.7b_clm_ipex" ]; then
script="evaluate_clm.py"
model_name_or_path="facebook/opt-2.7b"
approach="PostTrainingStatic"
elif [ "${topology}" = "opt_6.7b_clm_ipex" ]; then
script="evaluate_clm.py"
model_name_or_path="facebook/opt-6.7b"
approach="PostTrainingStatic"
elif [ "${topology}" = "llama_7b_clm_ipex" ]; then
script="evaluate_clm.py"
model_name_or_path="decapoda-research/llama-7b-hf"
approach="PostTrainingStatic"
elif [ "${topology}" = "bert_mlm_static" ]; then
script="run_mlm.py"
DATASET_NAME="wikitext"
@@ -154,24 +167,26 @@ function run_benchmark {

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
if [ ${script} != "evaluate_clm.py" ];then
model_name_or_path=${tuned_checkpoint}
fi
fi

if [[ ${int8} == "true" ]] && [ "${topology}" = "gpt_j_6b_clm_ipex" ]; then
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
fi

if [[ ${bf16} == "true" ]]; then
extra_cmd=$extra_cmd" --bf16_ipex"
fi
if [ "${tasks}" != "" ]; then
extra_cmd=$extra_cmd" --tasks ${tasks}"
fi

echo $extra_cmd

if [ ${script} = "evaluate_clm.py" ];then
python -u ./${script} \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--batch_size ${batch_size} \
${mode_cmd} \
${extra_cmd}
elif [ -z ${DATASET_CONFIG_NAME} ];then
@@ -18,6 +18,7 @@ function init_params {
batch_size=8
model_type="bert"
approach="PostTrainingStatic"
alpha=0.5
for var in "$@"
do
case $var in
@@ -85,9 +86,27 @@ function run_tuning {
approach="PostTrainingDynamic"
elif [ "${topology}" = "gpt_j_6b_clm_ipex" ]; then
script="evaluate_clm.py"
DATASET_NAME="lambada"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
approach="PostTrainingStatic"
elif [ "${topology}" = "opt_2.7b_clm_ipex" ]; then
script="evaluate_clm.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-2.7b"
approach="PostTrainingStatic"
extra_cmd=$extra_cmd" --int8_bf16_mixed"
elif [ "${topology}" = "opt_6.7b_clm_ipex" ]; then
script="evaluate_clm.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-6.7b"
approach="PostTrainingStatic"
extra_cmd=$extra_cmd" --int8_bf16_mixed"
elif [ "${topology}" = "llama_7b_clm_ipex" ]; then
script="evaluate_clm.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="decapoda-research/llama-7b-hf"
approach="PostTrainingStatic"
extra_cmd=$extra_cmd" --int8_bf16_mixed"
elif [ "${topology}" = "bert_mlm_static" ]; then
script="run_mlm.py"
DATASET_NAME="wikitext"
@@ -179,10 +198,10 @@ function run_tuning {
python -u ./${script} \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--dataset ${DATASET_NAME} \
--quantize \
--sq \
--alpha ${alpha} \
${extra_cmd}
elif [ -z ${DATASET_CONFIG_NAME} ];then
python -u ./${script} \