Add MPT/Falcon example for CLM accuracy (#1077)
changwangss committed Jun 28, 2023
1 parent 6562b93 commit f6ca74d
Showing 23 changed files with 3,473 additions and 15 deletions.
52 changes: 52 additions & 0 deletions examples/.config/pytorch_optimize.json
@@ -1437,6 +1437,58 @@
"config": "saved_results"
}
}
},
"mpt_7b_chat_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "mpt_7b_chat",
"task": "clm",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "mpt_7b_chat",
"task": "clm",
"approach": "static",
"backend": "ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"falcon_7b_instruct_clm": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "falcon_7b_instruct",
"task": "clm",
"approach": "static",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "falcon_7b_instruct",
"task": "clm",
"approach": "static",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"opt_1.3b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
34 changes: 34 additions & 0 deletions examples/huggingface/pytorch/language-modeling/README.md
@@ -139,3 +139,37 @@ We provide FP32/BF16 inference, INT8 inference, and other advanced compression techniques
</tr>
</tbody>
</table>


# Purpose of the Language Model Optimizations for Intel Architecture


- Demonstrate the AI workloads and deep learning models Intel has optimized and validated to run on Intel hardware

- Show how to efficiently execute, train, and deploy Intel-optimized models

- Make it easy to get started running Intel-optimized models on Intel hardware in the cloud or on bare metal



DISCLAIMER: These scripts are not intended for benchmarking Intel platforms. For any performance and/or benchmarking information on specific Intel platforms, visit https://www.intel.ai/blog.



Intel is committed to respecting human rights and avoiding complicity in human rights abuses, a policy reflected in the Intel Global Human Rights Principles. Accordingly, by accessing the Intel material on this platform you agree that you will not use the material in a product or application that causes or contributes to a violation of an internationally recognized human right.



## Models

To the extent that any model(s) are referenced by Intel or accessed using tools or code on this site, those models are provided by the third party indicated as the source. Intel does not create the model(s) and does not warrant their accuracy or quality. You understand that you are responsible for understanding the terms of use and that your use complies with the applicable license.



## Datasets

To the extent that any public datasets are referenced by Intel or accessed using tools or code on this site, those items are provided by the third party indicated as the source of the data. Intel does not create the data or datasets, and does not warrant their accuracy or quality. By accessing the public dataset(s) you agree to the terms associated with those datasets and that your use complies with the applicable license.



Intel expressly disclaims the accuracy, adequacy, or completeness of any public datasets, and is not liable for any errors, omissions, or defects in the data, or for any reliance on the data. Intel is not liable for any liability or damages relating to your use of public datasets.
@@ -4,7 +4,7 @@ This document describes the step-by-step instructions to run large language models

The scripts `run_clm.py`, `run_mlm.py`, and `run_plm.py` each provide three quantization approaches (PostTrainingDynamic, PostTrainingStatic, QuantAwareTraining) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor) and report last-token prediction accuracy via the `trainer`.

- The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA`, `BLOOM` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models.
+ The script `run_clm_no_trainer.py` supports quantization of `GPTJ`, `OPT`, `LLaMA`, `BLOOM`, and `MPT`, and validates last-word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.

# Prerequisite
## 1. Create Environment
@@ -33,7 +33,6 @@ Here is how to run the scripts:
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
- --dataset NeelNanda/pile-10k \
--sq \
--alpha 1.0 \
--output_dir "saved_results" \
@@ -63,7 +62,6 @@ python run_clm_no_trainer.py \
python run_clm_no_trainer.py \
--model facebook/opt-2.7b \
--quantize \
- --dataset NeelNanda/pile-10k \
--sq \
--alpha 0.5 \
--ipex \
@@ -93,7 +91,6 @@ python run_clm_no_trainer.py \
python run_clm_no_trainer.py \
--model decapoda-research/llama-7b-hf \
--quantize \
- --dataset NeelNanda/pile-10k \
--sq \
--alpha 0.8 \
--ipex \
@@ -113,6 +110,64 @@
--output_dir "saved_results" # load int8 model
# to validate FP32 model, please remove "--int8" and "--output_dir".
```

### MPT-7b-chat
#### Quantization
`mosaicml/mpt-7b-chat` is updated frequently and has not yet been integrated into `transformers`, so this example pins a specific commit with `--revision` to keep it reproducible.
```bash
# "--sq" is used to enable smooth quant
# "--int8_bf16_mixed" is used to enable int8-bf16 mixed mode for platform that natively supports bf16
python run_clm_no_trainer.py \
--model mosaicml/mpt-7b-chat \
--revision c8d4750ac8421303665d6ecc253950c69b56d324 \
--quantize \
--sq \
--alpha 0.85 \
--ipex \
--output_dir "saved_results"
```
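
The `--alpha` value controls how SmoothQuant splits quantization difficulty between activations and weights. As a point of reference, here is a minimal sketch of the per-channel smoothing scale from the SmoothQuant formulation; it is illustrative only, not the code path that `--sq` actually invokes:

```python
import torch

def smoothquant_scales(act_absmax: torch.Tensor, w_absmax: torch.Tensor,
                       alpha: float = 0.85) -> torch.Tensor:
    """Per-input-channel smoothing factors s_j = max|X_j|^alpha / max|W_j|^(1 - alpha).

    Dividing activations by s and multiplying weights by s leaves X @ W
    unchanged while flattening activation outliers, which makes static
    INT8 quantization of the activations easier.
    """
    return act_absmax.pow(alpha) / w_absmax.pow(1.0 - alpha)
```

A larger alpha migrates more of the dynamic range into the weights; the best value is model-dependent, which is why this example uses 0.85 for MPT and 0.7 for Falcon below.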

#### Accuracy with lm_eval
```bash
python run_clm_no_trainer.py \
--model mosaicml/mpt-7b-chat \
--accuracy_only \
--batch_size 112 \
--tasks "lambada_openai" \
--int8 \
--ipex \
--output_dir "saved_results" # load int8 model
# to validate FP32 model, please remove "--int8" and "--output_dir".
```
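
For context, `--accuracy_only --tasks "lambada_openai"` reports last-word prediction accuracy through the EleutherAI harness. Below is a hedged sketch of the equivalent direct call, assuming the `simple_evaluate` entry point and the `hf-causal` model type of lm-eval 0.3.x; the script wraps the harness differently, and loading MPT may additionally require remote-code support depending on the harness version:

```python
from lm_eval import evaluator

# Evaluate last-word prediction accuracy on LAMBADA (OpenAI variant).
results = evaluator.simple_evaluate(
    model="hf-causal",
    model_args="pretrained=mosaicml/mpt-7b-chat",
    tasks=["lambada_openai"],
    batch_size=112,
)
print(results["results"]["lambada_openai"]["acc"])
```
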
### Falcon-7b-instruct
#### Quantization
`tiiuae/falcon-7b-instruct` is updated frequently and has not yet been integrated into `transformers`, so this example pins a specific commit with `--revision` to keep it reproducible.
```bash
# "--sq" is used to enable smooth quant
# "--int8_bf16_mixed" is used to enable int8-bf16 mixed mode for platform that natively supports bf16
python run_clm_no_trainer.py \
--model tiiuae/falcon-7b-instruct \
--revision c7f670a03d987254220f343c6b026ea0c5147185 \
--quantize \
--sq \
--alpha 0.7 \
--output_dir "saved_results"
```

#### Accuracy with lm_eval
```bash
python run_clm_no_trainer.py \
--model tiiuae/falcon-7b-instruct \
--accuracy_only \
--batch_size 112 \
--tasks "lambada_openai" \
--int8 \
--ipex \
--output_dir "saved_results" # load int8 model
# to validate FP32 model, please remove "--int8" and "--output_dir".
```
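
For reference, the `--int8 --output_dir "saved_results"` pair reloads the previously quantized model instead of re-running tuning. Here is a minimal sketch of how such a checkpoint is typically restored with Intel® Neural Compressor's PyTorch `load` utility; this is an assumption about the script's internals, shown for illustration:

```python
from transformers import AutoModelForCausalLM
from neural_compressor.utils.pytorch import load

# Rebuild the FP32 graph first; INC then overlays the saved INT8
# configuration and weights from the output directory.
fp32_model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b-instruct",
    revision="c7f670a03d987254220f343c6b026ea0c5147185",
    trust_remote_code=True,
)
int8_model = load("saved_results", fp32_model)
```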


To run quantization based on the Transformers language-modeling example [`run_clm.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py), use the following command.
```bash
python run_clm.py \
@@ -157,3 +212,4 @@ python run_mlm.py \
--overwrite_output_dir
```


Empty file.
@@ -0,0 +1,79 @@
# coding=utf-8
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Bloom configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class RWConfig(PretrainedConfig):
model_type = "RefinedWebModel"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_hidden_layers": "n_layer",
"num_attention_heads": "n_head",
}

def __init__(
self,
vocab_size=250880,
hidden_size=64,
n_layer=2,
n_head=8,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
use_cache=True,
bos_token_id=1,
eos_token_id=2,
apply_residual_connection_post_layernorm=False,
hidden_dropout=0.0,
attention_dropout=0.0,
multi_query=False,
alibi=False,
bias=False,
parallel_attn=False,
**kwargs,
):
self.vocab_size = vocab_size
# Backward compatibility with n_embed kwarg
n_embed = kwargs.pop("n_embed", None)
self.hidden_size = hidden_size if n_embed is None else n_embed
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout

self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.multi_query = multi_query
self.alibi = alibi
self.bias = bias
self.parallel_attn = parallel_attn

super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

@property
def head_dim(self):
return self.hidden_size // self.n_head

@property
def rotary(self):
return not self.alibi
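
A small usage sketch of the configuration above; the hyperparameter values mirror the published Falcon-7B settings and are illustrative, not taken from this commit:

```python
# Hypothetical instantiation with Falcon-7B-like hyperparameters.
config = RWConfig(
    vocab_size=65024,
    hidden_size=4544,
    n_layer=32,
    n_head=71,
    multi_query=True,
    parallel_attn=True,
)
print(config.head_dim)  # 64, i.e. 4544 // 71
print(config.rotary)    # True: rotary embeddings apply whenever alibi is disabled
```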
