Add onnx whisper-large quantization example (#920)

intel · Jul 3, 2023 · 038be06 · 038be06
1 parent 683cdc9
commit 038be06
Show file tree

Hide file tree

Showing 8 changed files with 467 additions and 0 deletions.
diff --git a/examples/.config/onnx_optimize.json b/examples/.config/onnx_optimize.json
@@ -0,0 +1,52 @@
+{
+  "whisper_large_static": {
+    "working_dir": "huggingface/onnxruntime/speech-recognition/quantization",
+    "tune":{
+      "cmd": "bash run_tuning.sh",
+      "params": {
+        "config": "/tf_dataset2/models/onnx/whisper_large",
+        "approach": "static",
+        "output_model": "whisper-large-with-past-static",
+        "input_model": "/tf_dataset2/models/onnx/whisper_large",
+        "dataset_location": "/tf_dataset2/datasets/datasets_cache"
+      }
+    },
+    "benchmark": {
+      "cmd": "bash run_benchmark.sh",
+      "params": {
+        "config": "/tf_dataset2/models/onnx/whisper_large",
+        "mode": "accuracy",
+        "batch_size": "1",
+        "iters": "100",
+        "input_model": "whisper-large-with-past-static",
+        "dataset_location": "/tf_dataset2/datasets/datasets_cache",
+        "int8": "false"
+      }
+    }
+  },
+  "whisper_large_dynamic": {
+    "working_dir": "huggingface/onnxruntime/speech-recognition/quantization",
+    "tune":{
+      "cmd": "bash run_tuning.sh",
+      "params": {
+        "config": "/tf_dataset2/models/onnx/whisper_large",
+        "approach": "dynamic",
+        "output_model": "whisper-large-with-past-dynamic",
+        "input_model": "/tf_dataset2/models/onnx/whisper_large",
+        "dataset_location": "/tf_dataset2/datasets/datasets_cache"
+      }
+    },
+    "benchmark": {
+      "cmd": "bash run_benchmark.sh",
+      "params": {
+        "config": "/tf_dataset2/models/onnx/whisper_large",
+        "mode": "accuracy",
+        "batch_size": "1",
+        "iters": "100",
+        "input_model": "whisper-large-with-past-dynamic",
+        "dataset_location": "/tf_dataset2/datasets/datasets_cache",
+        "int8": "false"
+      }
+    }
+  }
+}
diff --git a/examples/huggingface/onnxruntime/README.md b/examples/huggingface/onnxruntime/README.md
@@ -0,0 +1,3 @@
+We have [optimization](optimization_README.md) examples.
+
+
diff --git a/examples/huggingface/onnxruntime/optimization_README.md b/examples/huggingface/onnxruntime/optimization_README.md
@@ -0,0 +1,10 @@
+# Huggingface Examples
+
+Welcome to the ONNX Runtime Huggingface examples. The models are from [Huggingface](https://huggingface.co), and the model compression technology is based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor).
+
+## Quantization approach
+
+| Task | PostTrainingDynamic | PostTrainingStatic |
+|---|:---:|:---:|
+|**`speech-recognition`**| ✅ | ✅ |
+
diff --git a/examples/huggingface/onnxruntime/speech-recognition/quantization/README.md b/examples/huggingface/onnxruntime/speech-recognition/quantization/README.md
@@ -0,0 +1,62 @@
+Step-by-Step
+============
+The script `run_whisper.py` provides two quantization approaches (PostTrainingStatic and PostTrainingDynamic) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor) with [LibriSpeech test-clean](https://huggingface.co/datasets/librispeech_asr) dataset.
+
+# Prerequisite
+## 1. Create Environment
+```shell
+pip install -r requirements.txt
+```
+
+## 2. Prepare Model
+```
+optimum-cli export onnx --model openai/whisper-large whisper-large-with-past/ --task automatic-speech-recognition-with-past --opset 13
+```
+
+# Run
+## 1. Quantization
+
+- To get int8 model
+
+```
+# --dataset_location is optional.
+# For dynamic quantization use --approach=dynamic and, e.g., --output_model=whisper-large-with-past-dynamic/
+bash run_tuning.sh --config=openai/whisper-large \
+                   --dataset_location=/path/to/dataset \
+                   --input_model=whisper-large-with-past/ \
+                   --output_model=whisper-large-with-past-static/ \
+                   --approach=static
+```
+
+- To get model accuracy
+
+```
+# --dataset_location is optional.
+bash run_benchmark.sh --config=whisper-large-with-past \
+                      --dataset_location=/path/to/dataset \
+                      --input_model=whisper-large-with-past-static/ \
+                      --int8=true \
+                      --mode=accuracy
+```
+
+- To get model performance
+
+```
+# --dataset_location is optional.
+numactl -m 0 -C 0-3 bash run_benchmark.sh --config=whisper-large-with-past \
+                                          --dataset_location=/path/to/dataset \
+                                          --input_model=whisper-large-with-past-static/ \
+                                          --mode=benchmark \
+                                          --iters=100 \
+                                          --cores_per_instance=4 \
+                                          --int8=true \
+                                          --max_new_tokens=16
+```
+
+**Notes**: 
+ - If users don't set dataset_location, it will download the dataset or use the cached dataset automatically.
+ - numactl command is used to bind specific cores.
+
+# Validated model list
+
+|Topology|Pretrained model|PostTrainingDynamic|PostTrainingStatic|
+|---|------------------------------------|---|---|
+|whisper_large|openai/whisper-large| ✅| ✅|
+
+
diff --git a/examples/huggingface/onnxruntime/speech-recognition/quantization/requirements.txt b/examples/huggingface/onnxruntime/speech-recognition/quantization/requirements.txt
@@ -0,0 +1,11 @@
+datasets
+torch
+transformers
+jiwer
+optimum
+onnx
+onnxruntime
+evaluate
+neural-compressor
+librosa
+soundfile
diff --git a/examples/huggingface/onnxruntime/speech-recognition/quantization/run_benchmark.sh b/examples/huggingface/onnxruntime/speech-recognition/quantization/run_benchmark.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+# Parse --key=value command-line options into global variables.
+# Defaults set here: iters=100, dataset_location=$HOME/.cache/huggingface,
+# script=run_whisper.py.
+# Globals written: iters, dataset_location, script, config, input_model, mode,
+#                  cores_per_instance, max_new_tokens, int8
+# Options that match none of the patterns below are silently ignored.
+function init_params {
+  iters=100
+  dataset_location="${HOME}/.cache/huggingface"
+  script="run_whisper.py"
+  local arg
+  for arg in "$@"
+  do
+    # "${arg#*=}" strips everything up to and including the FIRST "=", so a
+    # value that itself contains "=" or whitespace is preserved verbatim.
+    case "${arg}" in
+      --config=*)
+          config="${arg#*=}"
+      ;;
+      --dataset_location=*)
+          dataset_location="${arg#*=}"
+      ;;
+      --input_model=*)
+          input_model="${arg#*=}"
+      ;;
+      --mode=*)
+          mode="${arg#*=}"
+      ;;
+      --iters=*)
+          iters="${arg#*=}"
+      ;;
+      --cores_per_instance=*)
+          cores_per_instance="${arg#*=}"
+      ;;
+      --max_new_tokens=*)
+          max_new_tokens="${arg#*=}"
+      ;;
+      --int8=*)
+          int8="${arg#*=}"
+      ;;
+      --int8)
+          # Bare flag form, treated the same as --int8=true.
+          int8="true"
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+# Build and execute the python command line from the parsed options.
+# Globals read: script, config, dataset_location, input_model, mode, iters,
+#               cores_per_instance, max_new_tokens, int8
+# Exits with status 1 when the mode is unknown or a required option is empty.
+function run_benchmark {
+
+    # When int8 is "false", --input_model is replaced by the --config value.
+    if [[ "${int8}" == "false" ]]; then
+        input_model="${config}"
+    fi
+
+    # An array keeps the optional flag as a separate, safely quoted word.
+    local -a mode_args
+    case "${mode}" in
+        accuracy)
+            mode_args=(--accuracy_only)
+        ;;
+        benchmark)
+            mode_args=(--benchmark)
+        ;;
+        *)
+            echo "Error: No such mode: ${mode}" >&2
+            exit 1
+        ;;
+    esac
+
+    # Fail early with a clear message: an empty value here would otherwise
+    # shift the following flag into the wrong argument slot.
+    if [[ -z "${config}" ]]; then
+        echo "Error: --config is required" >&2
+        exit 1
+    fi
+    if [[ -z "${input_model}" ]]; then
+        echo "Error: --input_model is required" >&2
+        exit 1
+    fi
+
+    # cores_per_instance and max_new_tokens fall back to 4 and 16 when they
+    # are unset or empty.
+    python -u "${script}" \
+        --model_name_or_path "${config}" \
+        --cache_dir "${dataset_location}" \
+        --cores_per_instance "${cores_per_instance:-4}" \
+        --input_model "${input_model}" \
+        --max_new_tokens "${max_new_tokens:-16}" \
+        --iters "${iters}" \
+        "${mode_args[@]}"
+}
+
+main "$@"
diff --git a/examples/huggingface/onnxruntime/speech-recognition/quantization/run_tuning.sh b/examples/huggingface/onnxruntime/speech-recognition/quantization/run_tuning.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+# Parse --key=value command-line options into global variables.
+# Defaults set here: approach=static, script=run_whisper.py,
+# dataset_location=$HOME/.cache/huggingface.
+# Globals written: approach, script, dataset_location, config, input_model,
+#                  output_model
+# Options that match none of the patterns below are silently ignored.
+function init_params {
+  approach="static"
+  script="run_whisper.py"
+  dataset_location="${HOME}/.cache/huggingface"
+  local arg
+  for arg in "$@"
+  do
+    # "${arg#*=}" strips everything up to and including the FIRST "=", so a
+    # value that itself contains "=" or whitespace is preserved verbatim.
+    case "${arg}" in
+      --config=*)
+          config="${arg#*=}"
+      ;;
+      --dataset_location=*)
+          dataset_location="${arg#*=}"
+      ;;
+      --input_model=*)
+          input_model="${arg#*=}"
+      ;;
+      --output_model=*)
+          output_model="${arg#*=}"
+      ;;
+      --approach=*)
+          approach="${arg#*=}"
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+
+    python -u ${script} \
+        --model_name_or_path ${config} \
+        --input_model ${input_model} \
+        --output_model ${output_model} \
+        --cache_dir ${dataset_location} \
+        --tune \
+        --approach ${approach}
+
+}
+
+main "$@"