Support different backends for adaptors (#157)
Signed-off-by: Mengni Wang <mengni.wang@intel.com>
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
Signed-off-by: Lv, Liang1 <liang1.lv@intel.com>
Co-authored-by: Cheng, Penghui <penghui.cheng@intel.com>
Co-authored-by: Lv, Liang1 <liang1.lv@intel.com>
3 people committed Dec 9, 2022
1 parent 47b51ed commit 2483a84
Showing 89 changed files with 1,276 additions and 1,938 deletions.
@@ -26,7 +26,7 @@
from mxnet.gluon.data import DataLoader
from mxnet.gluon.data.vision import transforms

from neural_compressor.adaptor.mxnet_utils.util import check_mx_version, get_backend_name
from neural_compressor.adaptor.mxnet_utils.util import check_mx_version, get_framework_name

if check_mx_version('2.0.0') or not check_mx_version('1.7.0'): # version >= 2.0.0 or == 1.6.0
from mxnet.contrib.quantization import quantize_net
@@ -82,7 +82,7 @@ def quantize(net, ctx, dataloader, batch_size, num_calib_batches, save_path, cal

data = next(iter(dataloader))[0].as_in_context(ctx)
if check_mx_version('1.7.0'):
qnet.optimize_for(data, backend=get_backend_name(ctx), static_alloc=True, static_shape=True)
qnet.optimize_for(data, backend=get_framework_name(ctx), static_alloc=True, static_shape=True)
qnet.export(save_path, 0)
logger.info('Saved quantized model to: {}'.format(save_path))

@@ -175,7 +175,9 @@ def iou(model_tensor, target_tensor):

def evaluate(model, dataloader):
totalIoU = 0
sess = onnxruntime.InferenceSession(model.SerializeToString(), None)
sess = onnxruntime.InferenceSession(model.SerializeToString(),
None,
providers=onnxruntime.get_available_providers())
idx = 1
for input_tensor, target_tensor in dataloader:
input_tensor = input_tensor[np.newaxis, ...]
@@ -21,7 +21,8 @@
max_answer_length = 30

def parse_dummy_input(model, benchmark_nums, max_seq_length):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
shapes = []
lows = []
highs = []
@@ -63,7 +64,8 @@ def __len__(self):
return len(self.input_ids)

def evaluate_squad(model, dataloader, input_ids, eval_examples, extra_data, input_file):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
for output_meta in session.get_outputs():
print(output_meta)
for input_meta in session.get_inputs():
@@ -538,7 +538,7 @@ def main():
convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
args.doc_stride, args.max_query_length)

sess = onnxrt.InferenceSession(args.model, sess_options)
sess = onnxrt.InferenceSession(args.model, sess_options, providers=onnxrt.get_available_providers())
for input_meta in sess.get_inputs():
print(input_meta)
n = len(input_ids)
@@ -115,7 +115,7 @@ def evaluate(args, model, tokenizer, prefix=""):
total_time = 0.0

options = ort.SessionOptions()
session = ort.InferenceSession(model.SerializeToString(), options)
session = ort.InferenceSession(model.SerializeToString(), options,
providers=ort.get_available_providers())
len_outputs = len(session.get_outputs())
len_inputs = len(session.get_inputs())
inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)]
@@ -19,7 +19,8 @@
max_answer_length = 30

def parse_dummy_input(model, benchmark_nums, max_seq_length):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
shapes = []
lows = []
highs = []
@@ -55,7 +56,8 @@ def __len__(self):
return len(self.input_ids)

def evaluate_squad(model, dataloader, input_ids, eval_examples, extra_data, input_file):
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
session = onnxruntime.InferenceSession(model.SerializeToString(), None,
providers=onnxruntime.get_available_providers())
for output_meta in session.get_outputs():
print(output_meta)
for input_meta in session.get_inputs():
@@ -4,10 +4,6 @@ Step-by-Step
This document lists the steps to reproduce the PyTorch BERT tuning zoo results.
For the original BERT documentation, please refer to [BERT README](../../../../common/README.md) and [README](../../../../common/examples/text-classification/README.md).

> **Note**
>
> Dynamic Quantization is the recommended method for huggingface models.
# Prerequisite

## 1. Installation
@@ -57,7 +53,7 @@ sh run_tuning.sh --topology=topology_name --dataset_location=/path/to/glue/data/
or

```bash
python -u ./run_glue_tune.py \
python -u ./run_glue.py \
--model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \
--task_name sst2 \
--do_eval \
@@ -73,7 +69,7 @@ python -u ./run_glue_tune.py \
### 2. To get the benchmark of the tuned model, including batch_size and throughput:

```bash
python -u ./run_glue_tune.py \
python -u ./run_glue.py \
--model_name_or_path ./int8_model_dir \
--task_name sst2 \
--do_eval \
@@ -158,7 +154,7 @@ Here we set accuracy target as tolerating 0.01 relative accuracy loss of baselin
### Code Prepare

We just need to update run_squad_tune.py and run_glue_tune.py as shown below
We just need to update run_glue.py as shown below

```python
if model_args.tune:
@@ -195,7 +191,7 @@ if model_args.tune:
### Using Shapley MSE as Objective

Shapley values originate from cooperative game theory, come with desirable properties, and are now widely used as a tool for Explainable AI. run_glue_tune_with_shap.py is designed to help build a BERT-based model using Shapley MSE as an objective. Here, Shapley MSE means that we obtain one set of Shapley values from the FP32 model and another from the INT8 model, and use MSE to measure how different the two sets are; this reflects the explainability of the INT8 model.
> **Note** : run_glue_tune_with_shap.py is an example for the "SST2" task. If you want to run another GLUE task, you may need to make slight changes to the "ShapleyMSE" class.
> **Note** : run_glue_with_shap.py is an example for the "SST2" task. If you want to run another GLUE task, you may need to make slight changes to the "ShapleyMSE" class.
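
A minimal sketch of the Shapley-MSE objective described above (the helper name and inputs are hypothetical; it assumes SHAP values have already been computed for the FP32 and INT8 models):

```python
import numpy as np

def shapley_mse(fp32_shap_values, int8_shap_values):
    """MSE between the Shapley values of the FP32 and INT8 models.

    A smaller value means the INT8 model attributes its predictions to the
    inputs in a way closer to the FP32 baseline, i.e. its explainability
    is better preserved.
    """
    fp32 = np.asarray(fp32_shap_values)
    int8 = np.asarray(int8_shap_values)
    return float(np.mean((fp32 - int8) ** 2))

# Hypothetical usage: shap_fp32 / shap_int8 would come from running a SHAP
# explainer on the FP32 model and on the quantized model respectively.
# objective = shapley_mse(shap_fp32, shap_int8)
```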

# Appendix
@@ -110,7 +110,7 @@ function run_benchmark {
fi
echo $extra_cmd

python -u run_glue_tune.py \
python -u run_glue.py \
--task_name ${TASK_NAME} \
--do_eval \
--max_seq_length ${MAX_SEQ_LENGTH} \
@@ -439,11 +439,11 @@ def eval_func_for_nc(model_tuned):
acc = result[key]
break
return acc
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, eval_func=eval_func_for_nc)
from neural_compressor.experimental import Quantization, common
quantizer = Quantization("./conf.yaml")
quantizer.model = common.Model(model)
quantizer.eval_func = eval_func_for_nc
q_model = quantizer.fit()
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)

@@ -76,7 +76,7 @@ function run_tuning {
sed -i "/: bert/s|name:.*|name: $model_type|g" conf.yaml
sed -i "/approach:/s|approach:.*|approach: $approach|g" conf.yaml

python -u ./run_glue_tune.py \
python -u ./run_glue.py \
--model_name_or_path ${model_name_or_path} \
--task_name ${TASK_NAME} \
--do_eval \
@@ -106,43 +106,12 @@ We also upstreamed several int8 models into HuggingFace [model hub](https://hugg
## This is a tutorial on how to enable an NLP model with Intel® Neural Compressor.


### Intel® Neural Compressor supports two usages:

1. User specifies fp32 'model', calibration dataset 'q_dataloader', evaluation dataset "eval_dataloader" and metrics in tuning.metrics field of model-specific yaml config file.
2. User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

Since MRPC's metrics are 'f1', 'acc_and_f1', 'mcc', 'spearmanr', and 'acc', the user should provide an evaluation function 'eval_func'; this fits the second use case.

### Write Yaml config file

In the examples directory there is a conf.yaml. We can remove most of the items and keep only the mandatory items for tuning.

```yaml
model:
  name: bert
  framework: pytorch_fx

device: cpu

quantization:
  approach: post_training_dynamic_quant

tuning:
  accuracy_criterion:
    relative: 0.01
  exit_policy:
    timeout: 0
    max_trials: 300
  random_seed: 9527
```

Here we set the accuracy target to tolerate a 0.01 relative accuracy loss from the baseline. The default tuning strategy is the basic strategy. A timeout of 0 means tuning stops early as soon as a tuning config meets the accuracy target.

> **Note** : neural_compressor does NOT support "mse" tuning strategy for pytorch framework
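
For comparison, roughly the same target can be expressed with the 2.x config API that this commit migrates the examples to (a sketch, not part of this diff; it assumes the `AccuracyCriterion` and `TuningCriterion` classes from `neural_compressor.config`):

```python
from neural_compressor.config import (AccuracyCriterion, PostTrainingQuantConfig,
                                      TuningCriterion)

# Tolerate 0.01 relative accuracy loss; timeout=0 stops as soon as the target is met.
accuracy_criterion = AccuracyCriterion(criterion="relative", tolerable_loss=0.01)
tuning_criterion = TuningCriterion(timeout=0, max_trials=300)
conf = PostTrainingQuantConfig(approach="dynamic",
                               accuracy_criterion=accuracy_criterion,
                               tuning_criterion=tuning_criterion)
```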
### Intel® Neural Compressor supports the following usage:
* User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

### Code Prepare

We just need to update run_squad_tune.py and run_glue.py as shown below
We just need to update run_glue.py as shown below

```python
trainer = Trainer(
@@ -170,20 +139,10 @@ def take_eval_steps(model, trainer, metric_name, save_metrics=False):
def eval_func(model):
    return take_eval_steps(model, trainer, metric_name)

from neural_compressor.experimental import Quantization, common
if (
    not training_args.dataloader_drop_last
    and eval_dataset.shape[0] % training_args.per_device_eval_batch_size != 0
):
    raise ValueError(
        "The number of samples of the dataset is not a multiple of the batch size."
        "Use --dataloader_drop_last to overcome."
    )
calib_dataloader = eval_dataloader
quantizer = Quantization('conf.yaml')
quantizer.eval_func = eval_func
quantizer.calib_dataloader = calib_dataloader
quantizer.model = common.Model(model)
model = quantizer.fit()
model.save(training_args.output_dir)
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch",
tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, eval_func=eval_func)
```
@@ -28,4 +28,4 @@ tuning:
exit_policy:
timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit.
max_trials: 600
random_seed: 9527 # optional. random seed for deterministic tuning.
random_seed: 9527 # optional. random seed for deterministic tuning.
@@ -498,15 +498,13 @@ def eval_func(model):

# optimize and quantize with Neural Compressor
if model_args.tune:
from neural_compressor.experimental import Quantization, common
calib_dataloader = eval_dataloader
quantizer = Quantization('conf.yaml')
quantizer.eval_func = eval_func
quantizer.calib_dataloader = calib_dataloader
quantizer.model = common.Model(model)
model = quantizer.fit()
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="dynamic", backend="pytorch", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, eval_func=eval_func)
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(model, tokenizer, training_args.output_dir)
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
return

if model_args.benchmark or model_args.accuracy_only:
@@ -106,43 +106,12 @@ We also upstreamed several int8 models into HuggingFace [model hub](https://hugg
## This is a tutorial on how to enable an NLP model with Intel® Neural Compressor.


### Intel® Neural Compressor supports two usages:

1. User specifies fp32 'model', calibration dataset 'q_dataloader', evaluation dataset "eval_dataloader" and metrics in tuning.metrics field of model-specific yaml config file.
2. User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

Since MRPC's metrics are 'f1', 'acc_and_f1', 'mcc', 'spearmanr', and 'acc', the user should provide an evaluation function 'eval_func'; this fits the second use case.

### Write Yaml config file

In the examples directory there is a conf.yaml. We can remove most of the items and keep only the mandatory items for tuning.

```yaml
model:
  name: bert
  framework: pytorch_fx

device: cpu

quantization:
  approach: post_training_static_quant

tuning:
  accuracy_criterion:
    relative: 0.01
  exit_policy:
    timeout: 0
    max_trials: 300
  random_seed: 9527
```

Here we set the accuracy target to tolerate a 0.01 relative accuracy loss from the baseline. The default tuning strategy is the basic strategy. A timeout of 0 means tuning stops early as soon as a tuning config meets the accuracy target.

> **Note** : neural_compressor does NOT support "mse" tuning strategy for pytorch framework
### Intel® Neural Compressor supports the following usage:
* User specifies fp32 'model', calibration dataset 'q_dataloader' and a custom "eval_func" which encapsulates the evaluation dataset and metrics by itself.

### Code Prepare

We just need to update run_squad_tune.py and run_glue.py as shown below
We just need to update run_glue.py as shown below

```python
trainer = Trainer(
@@ -170,22 +139,13 @@ def take_eval_steps(model, trainer, metric_name, save_metrics=False):
def eval_func(model):
    return take_eval_steps(model, trainer, metric_name)

from neural_compressor.experimental import Quantization, common
if (
    not training_args.dataloader_drop_last
    and eval_dataset.shape[0] % training_args.per_device_eval_batch_size != 0
):
    raise ValueError(
        "The number of samples of the dataset is not a multiple of the batch size."
        "Use --dataloader_drop_last to overcome."
    )
calib_dataloader = eval_dataloader
quantizer = Quantization('conf.yaml')
quantizer.eval_func = eval_func
quantizer.calib_dataloader = calib_dataloader
quantizer.model = common.Model(model)
model = quantizer.fit()
model.save(training_args.output_dir)
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="static", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func)
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
```
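
The int8 model saved above with save_for_huggingface_upstream can later be reloaded for inference. A small sketch, assuming the OptimizedModel helper from neural_compressor's huggingface utilities (not shown in this diff):

```python
from neural_compressor.utils.load_huggingface import OptimizedModel

# Reload the quantized model from the output directory used above.
model = OptimizedModel.from_pretrained(training_args.output_dir)
model.eval()
```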

# Appendix
@@ -28,4 +28,4 @@ tuning:
exit_policy:
timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit.
max_trials: 600
random_seed: 9527 # optional. random seed for deterministic tuning.
random_seed: 9527 # optional. random seed for deterministic tuning.
@@ -504,7 +504,7 @@ def eval_func(model):
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion
tuning_criterion = TuningCriterion(max_trials=600)
conf = PostTrainingQuantConfig(approach="static", backend="pytorch_fx", tuning_criterion=tuning_criterion)
conf = PostTrainingQuantConfig(approach="static", tuning_criterion=tuning_criterion)
q_model = fit(model, conf=conf, calib_dataloader=eval_dataloader, eval_func=eval_func)
from neural_compressor.utils.load_huggingface import save_for_huggingface_upstream
save_for_huggingface_upstream(q_model, tokenizer, training_args.output_dir)
@@ -38,9 +38,7 @@ PyTorch 1.8 is needed for pytorch_fx backend and huggingface/transformers.
### 1. Enable bert-base-cased/uncased example with the auto quantization aware training strategy of Neural Compressor.

The changes made are as follows:
1. add conf_qat.yaml:
This file contains the configuration of quantization.
2. edit run_glue_tune.py:
* edit run_glue.py:
- For quantization, we used neural_compressor in it (see the sketch after this list).
- For training, we enabled the early stop strategy.
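
A rough sketch of how quantization-aware training is typically wired in with the 2.x API (the exact integration inside run_glue.py, including the early-stop handling, may differ from this):

```python
from neural_compressor.config import QuantizationAwareTrainingConfig
from neural_compressor.training import prepare_compression

# 'model' is the fine-tuning model prepared by the example script.
conf = QuantizationAwareTrainingConfig()
compression_manager = prepare_compression(model, conf)

compression_manager.callbacks.on_train_begin()
# ... the usual training loop runs here, optionally stopping early
#     once the accuracy target is reached ...
compression_manager.callbacks.on_train_end()

# The quantized model wrapped by the manager is then available for saving.
quantized_model = compression_manager.model
```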

@@ -50,7 +48,7 @@

or

python run_glue_tune.py \
python run_glue.py \
--model_name_or_path ${input_model} \
--task_name ${task_name} \
--do_train \
@@ -77,7 +75,7 @@ or

or

python run_glue_tune.py \
python run_glue.py \
--model_name_or_path ${input_model}/${tuned_checkpoint} \
--task_name ${task_name} \
--do_train \