Support different backends for adaptors (#157)
Signed-off-by: Mengni Wang <mengni.wang@intel.com>
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
Signed-off-by: Lv, Liang1 <liang1.lv@intel.com>
Co-authored-by: Cheng, Penghui <penghui.cheng@intel.com>
Co-authored-by: Lv, Liang1 <liang1.lv@intel.com>
3 people committed Dec 9, 2022
1 parent 47b51ed commit 2483a84
Showing 89 changed files with 1,276 additions and 1,938 deletions.
@@ -26,7 +26,7 @@
from mxnet.gluon.data import DataLoader
from mxnet.gluon.data.vision import transforms

from neural_compressor.adaptor.mxnet_utils.util import check_mx_version, get_backend_name
from neural_compressor.adaptor.mxnet_utils.util import check_mx_version, get_framework_name

if check_mx_version('2.0.0') or not check_mx_version('1.7.0'): # version >= 2.0.0 or == 1.6.0
from mxnet.contrib.quantization import quantize_net
@@ -82,7 +82,7 @@ def quantize(net, ctx, dataloader, batch_size, num_calib_batches, save_path, cal

data = next(iter(dataloader))[0].as_in_context(ctx)
if check_mx_version('1.7.0'):
qnet.optimize_for(data, backend=get_backend_name(ctx), static_alloc=True, static_shape=True)
qnet.optimize_for(data, backend=get_framework_name(ctx), static_alloc=True, static_shape=True)
qnet.export(save_path, 0)
logger.info('Saved quantized model to: {}'.format(save_path))

@@ -175,7 +175,9 @@ def iou(model_tensor, target_tensor):

def evaluate(model, dataloader):
totalIoU = 0
sess = onnxruntime.InferenceSession(model.SerializeToString(), None)
sess = onnxruntime.InferenceSession(model.SerializeToString(),
None,
providers=onnxruntime.get_available_providers())
idx = 1
for input_tensor, target_tensor in dataloader:
input_tensor = input_tensor[np.newaxis, ...]
@@ -21,7 +21,8 @@
max_answer_length = 30

def parse_dummy_input(model, benchmark_nums, max_seq_length):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
shapes = []
lows = []
highs = []
@@ -63,7 +64,8 @@ def __len__(self):
return len(self.input_ids)

def evaluate_squad(model, dataloader, input_ids, eval_examples, extra_data, input_file):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
for output_meta in session.get_outputs():
print(output_meta)
for input_meta in session.get_inputs():
@@ -538,7 +538,7 @@ def main():
convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
args.doc_stride, args.max_query_length)

sess = onnxrt.InferenceSession(args.model, sess_options)
sess = onnxrt.InferenceSession(args.model, sess_options, providers=onnxrt.get_available_providers())
for input_meta in sess.get_inputs():
print(input_meta)
n = len(input_ids)
@@ -115,7 +115,7 @@ def evaluate(args, model, tokenizer, prefix=""):
total_time = 0.0

options = ort.SessionOptions()
session = ort.InferenceSession(model.SerializeToString(), options)
session = ort.InferenceSession(model.SerializeToString(), options,
providers=ort.get_available_providers())
len_outputs = len(session.get_outputs())
len_inputs = len(session.get_inputs())
inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)]
@@ -19,7 +19,8 @@
max_answer_length = 30

def parse_dummy_input(model, benchmark_nums, max_seq_length):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
shapes = []
lows = []
highs = []
@@ -55,7 +56,8 @@ def __len__(self):
return len(self.input_ids)

def evaluate_squad(model, dataloader, input_ids, eval_examples, extra_data, input_file):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
for output_meta in session.get_outputs():
print(output_meta)
for input_meta in session.get_inputs():
@@ -4,10 +4,6 @@ Step-by-Step
This document lists the steps to reproduce the PyTorch BERT tuning zoo results.
For the original BERT documentation, please refer to [BERT README](../../../../common/README.md) and [README](../../../../common/examples/text-classification/README.md).

> **Note**
>
> Dynamic Quantization is the recommended method for huggingface models.
# Prerequisite

## 1. Installation
@@ -57,7 +53,7 @@ sh run_tuning.sh --topology=topology_name --dataset_location=/path/to/glue/data/
or

```bash
python -u ./run_glue_tune.py \
python -u ./run_glue.py \
--model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \
--task_name sst2 \
--do_eval \
@@ -73,7 +69,7 @@ python -u ./run_glue_tune.py \
### 2. To get the benchmark of the tuned model, including batch_size and throughput:

```bash
python -u ./run_glue_tune.py \
python -u ./run_glue.py \
--model_name_or_path ./int8_model_dir \
--task_name sst2 \
--do_eval \
@@ -158,7 +154,7 @@ Here we set accuracy target as tolerating 0.01 relative accuracy loss of baselin
### Code Prepare

We just need to update run_squad_tune.py and run_glue_tune.py as shown below
We just need to update run_glue.py as shown below

```python
if model_args.tune:
@@ -195,7 +191,7 @@ if model_args.tune:
### Using Shapley MSE as Objective

Shapley values originate from cooperative game theory, come with desirable properties, and are now widely used as a tool for Explainable AI. run_glue_tune_with_shap.py is designed to help build a BERT-based model using Shapley MSE as an objective. Here, Shapley MSE means that we obtain one set of Shapley values from the FP32 model and another from the INT8 model, and use MSE to measure how different the two sets are; this reflects the explainability of the INT8 model.
> **Note** : run_glue_tune_with_shap.py is an example for the "SST2" task. If you want to run another GLUE task, you may need to make slight changes to the "ShapleyMSE" class.
> **Note** : run_glue_with_shap.py is an example for the "SST2" task. If you want to run another GLUE task, you may need to make slight changes to the "ShapleyMSE" class.
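
A minimal sketch of the Shapley-MSE objective described above (the helper name and inputs are hypothetical; it assumes SHAP values have already been computed for the FP32 and INT8 models):

```python
import numpy as np

def shapley_mse(fp32_shap_values, int8_shap_values):
    """MSE between the Shapley values of the FP32 and INT8 models.

    A smaller value means the INT8 model attributes its predictions to the
    inputs in a way closer to the FP32 baseline, i.e. its explainability
    is better preserved.
    """
    fp32 = np.asarray(fp32_shap_values)
    int8 = np.asarray(int8_shap_values)
    return float(np.mean((fp32 - int8) ** 2))

# Hypothetical usage: shap_fp32 / shap_int8 would come from running a SHAP
# explainer on the FP32 model and on the quantized model respectively.
# objective = shapley_mse(shap_fp32, shap_int8)
```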

# Appendix
@@ -110,7 +110,7 @@ function run_benchmark {
fi
echo $extra_cmd

python -u run_glue_tune.py \
python -u run_glue.py \
--task_name ${TASK_NAME} \
--do_eval \
--max_seq_length ${MAX_SEQ_LENGTH} \
@@ -439,11 +439,11 @@ def eval_func_for_nc(model_tuned):
acc = result[key]
break
return acc
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, eval_func=eval_func_for_nc)
from neural_compressor.experimental import Quantization, common
quantizer = Quantization("./conf.yaml")
quantizer.model = common.Model(model)
quantizer.eval_func = eval_func_for_nc
q_model = quantizer.fit()
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)

@@ -76,7 +76,7 @@ function run_tuning {
sed -i "/: bert/s|name:.*|name: $model_type|g" conf.yaml
sed -i "/approach:/s|approach:.*|approach: $approach|g" conf.yaml

python -u ./run_glue_tune.py \
python -u ./run_glue.py \
--model_name_or_path ${model_name_or_path} \
--task_name ${TASK_NAME} \
--do_eval \
@@ -106,43 +106,12 @@ We also upstreamed several int8 models into HuggingFace [model hub](https://hugg
## This is a tutorial on how to enable an NLP model with Intel® Neural Compressor.


### Intel® Neural Compressor supports two usages:

1. User specifies fp32 'model', calibration dataset 'q_dataloader', evaluation dataset "eval_dataloader" and metrics in tuning.metrics field of model-specific yaml config file.
2. User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

Since MRPC's metrics are 'f1', 'acc_and_f1', 'mcc', 'spearmanr', and 'acc', the user should provide an evaluation function 'eval_func'; this fits the second use case.

### Write Yaml config file

In the examples directory there is a conf.yaml. We can remove most of the items and keep only the mandatory items for tuning.

```yaml
model:
  name: bert
  framework: pytorch_fx

device: cpu

quantization:
  approach: post_training_dynamic_quant

tuning:
  accuracy_criterion:
    relative: 0.01
  exit_policy:
    timeout: 0
    max_trials: 300
  random_seed: 9527
```

Here we set the accuracy target to tolerate a 0.01 relative accuracy loss from the baseline. The default tuning strategy is the basic strategy. A timeout of 0 means tuning stops early as soon as a tuning config meets the accuracy target.

> **Note** : neural_compressor does NOT support "mse" tuning strategy for pytorch framework
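
For comparison, roughly the same target can be expressed with the 2.x config API that this commit migrates the examples to (a sketch, not part of this diff; it assumes the `AccuracyCriterion` and `TuningCriterion` classes from `neural_compressor.config`):

```python
from neural_compressor.config import (AccuracyCriterion, PostTrainingQuantConfig,
                                      TuningCriterion)

# Tolerate 0.01 relative accuracy loss; timeout=0 stops as soon as the target is met.
accuracy_criterion = AccuracyCriterion(criterion="relative", tolerable_loss=0.01)
tuning_criterion = TuningCriterion(timeout=0, max_trials=300)
conf = PostTrainingQuantConfig(approach="dynamic",
                               accuracy_criterion=accuracy_criterion,
                               tuning_criterion=tuning_criterion)
```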
### Intel® Neural Compressor supports the following usage:
* User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

### Code Prepare

We just need to update run_squad_tune.py and run_glue.py as shown below
We just need to update run_glue.py as shown below

```python
trainer = Trainer(
@@ -170,20 +139,10 @@ def take_eval_steps(model, trainer, metric_name, save_metrics=False):
def eval_func(model):
    return take_eval_steps(model, trainer, metric_name)

from neural_compressor.experimental import Quantization, common
if (
    not training_args.dataloader_drop_last
    and eval_dataset.shape[0] % training_args.per_device_eval_batch_size != 0
):
    raise ValueError(
        "The number of samples of the dataset is not a multiple of the batch size."
        "Use --dataloader_drop_last to overcome."
    )
calib_dataloader = eval_dataloader
quantizer = Quantization('conf.yaml')
quantizer.eval_func = eval_func
quantizer.calib_dataloader = calib_dataloader
quantizer.model = common.Model(model)
model = quantizer.fit()
model.save(training_args.output_dir)
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch",
tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, eval_func=eval_func)
```
@@ -28,4 +28,4 @@ tuning:
exit_policy:
timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit.
max_trials: 600
random_seed: 9527 # optional. random seed for deterministic tuning.
random_seed: 9527 # optional. random seed for deterministic tuning.
@@ -498,15 +498,13 @@ def eval_func(model):

# optimize and quantize with Neural Compressor
if model_args.tune:
from neural_compressor.experimental import Quantization, common
calib_dataloader = eval_dataloader
quantizer = Quantization('conf.yaml')
quantizer.eval_func = eval_func
quantizer.calib_dataloader = calib_dataloader
quantizer.model = common.Model(model)
model = quantizer.fit()
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, eval_func=eval_func)
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(model, tokenizer, training_args.output_dir)
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
return

if model_args.benchmark or model_args.accuracy_only:
@@ -106,43 +106,12 @@ We also upstreamed several int8 models into HuggingFace [model hub](https://hugg
## This is a tutorial on how to enable an NLP model with Intel® Neural Compressor.


### Intel® Neural Compressor supports two usages:

1. User specifies fp32 'model', calibration dataset 'q_dataloader', evaluation dataset "eval_dataloader" and metrics in tuning.metrics field of model-specific yaml config file.
2. User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

Since MRPC's metrics are 'f1', 'acc_and_f1', 'mcc', 'spearmanr', and 'acc', the user should provide an evaluation function 'eval_func'; this fits the second use case.

### Write Yaml config file

In the examples directory there is a conf.yaml. We can remove most of the items and keep only the mandatory items for tuning.

```yaml
model:
  name: bert
  framework: pytorch_fx

device: cpu

quantization:
  approach: post_training_static_quant

tuning:
  accuracy_criterion:
    relative: 0.01
  exit_policy:
    timeout: 0
    max_trials: 300
  random_seed: 9527
```

Here we set the accuracy target to tolerate a 0.01 relative accuracy loss from the baseline. The default tuning strategy is the basic strategy. A timeout of 0 means tuning stops early as soon as a tuning config meets the accuracy target.

> **Note** : neural_compressor does NOT support "mse" tuning strategy for pytorch framework
### Intel® Neural Compressor supports the following usage:
* User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

### Code Prepare

We just need to update run_squad_tune.py and run_glue.py as shown below
We just need to update run_glue.py as shown below

```python
trainer = Trainer(
@@ -170,22 +139,13 @@ def take_eval_steps(model, trainer, metric_name, save_metrics=False):
def eval_func(model):
    return take_eval_steps(model, trainer, metric_name)

from neural_compressor.experimental import Quantization, common
if (
    not training_args.dataloader_drop_last
    and eval_dataset.shape[0] % training_args.per_device_eval_batch_size != 0
):
    raise ValueError(
        "The number of samples of the dataset is not a multiple of the batch size."
        "Use --dataloader_drop_last to overcome."
    )
calib_dataloader = eval_dataloader
quantizer = Quantization('conf.yaml')
quantizer.eval_func = eval_func
quantizer.calib_dataloader = calib_dataloader
quantizer.model = common.Model(model)
model = quantizer.fit()
model.save(training_args.output_dir)
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="static", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func)
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
```
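
The int8 model saved above with save_for_huggingface_upstream can later be reloaded for inference. A small sketch, assuming the OptimizedModel helper from neural_compressor's huggingface utilities (not shown in this diff):

```python
from neural_compressor.utils.load_huggingface import OptimizedModel

# Reload the quantized model from the output directory used above.
model = OptimizedModel.from_pretrained(training_args.output_dir)
model.eval()
```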

# Appendix
@@ -28,4 +28,4 @@ tuning:
exit_policy:
timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit.
max_trials: 600
random_seed: 9527 # optional. random seed for deterministic tuning.
random_seed: 9527 # optional. random seed for deterministic tuning.
@@ -504,7 +504,7 @@ def eval_func(model):
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion)
conf = PostTrainingQuantConfig(approach="static", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func)
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
@@ -38,9 +38,7 @@ PyTorch 1.8 is needed for pytorch_fx backend and huggingface/transformers.
### 1. Enable bert-base-cased/uncased example with the auto quantization aware training strategy of Neural Compressor.

The changes made are as follows:
1. add conf_qat.yaml:
This file contains the configuration of quantization.
2. edit run_glue_tune.py:
* edit run_glue.py:
- For quantization, we used neural_compressor in it (see the sketch after this list).
- For training, we enabled the early stop strategy.
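
A rough sketch of how quantization-aware training is typically wired in with the 2.x API (the exact integration inside run_glue.py, including the early-stop handling, may differ from this):

```python
from neural_compressor.config import QuantizationAwareTrainingConfig
from neural_compressor.training import prepare_compression

# 'model' is the fine-tuning model prepared by the example script.
conf = QuantizationAwareTrainingConfig()
compression_manager = prepare_compression(model, conf)

compression_manager.callbacks.on_train_begin()
# ... the usual training loop runs here, optionally stopping early
#     once the accuracy target is reached ...
compression_manager.callbacks.on_train_end()

# The quantized model wrapped by the manager is then available for saving.
quantized_model = compression_manager.model
```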

@@ -50,7 +48,7 @@

or

python run_glue_tune.py \
python run_glue.py \
--model_name_or_path ${input_model} \
--task_name ${task_name} \
--do_train \
@@ -77,7 +75,7 @@ or

or

python run_glue_tune.py \
python run_glue.py \
--model_name_or_path ${input_model}/${tuned_checkpoint} \
--task_name ${task_name} \
--do_train \