-
Notifications
You must be signed in to change notification settings - Fork 446
/
configuration.py
1047 lines (924 loc) · 50 KB
/
configuration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Configuration classes for graph optimization and quantization with ONNX Runtime."""
import os
import warnings
from dataclasses import asdict, dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from datasets import Dataset
from packaging.version import Version, parse
from onnxruntime import __version__ as ort_version
from onnxruntime.quantization import CalibraterBase, CalibrationMethod, QuantFormat, QuantizationMode, QuantType
from onnxruntime.quantization.calibrate import create_calibrator
from onnxruntime.quantization.registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry
from onnxruntime.transformers.fusion_options import FusionOptions
from ..configuration_utils import BaseConfig
from ..utils import logging
# Module-level logger, obtained through the project's logging utilities.
logger = logging.get_logger(__name__)
# Maps an operator type to the axis ONNX Runtime should use when quantizing that operator "per-channel".
# Used as the default value of `QuantizationConfig.qdq_op_type_per_channel_support_to_axis`.
ORT_DEFAULT_CHANNEL_FOR_OPERATORS = {"MatMul": 1}
# Default operator types to quantize, taken from the keys of the ONNX Runtime quantization registries.
# Reference: https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py
# Selected by `default_quantization_parameters` for dynamic quantization with the IntegerOps mode.
ORT_DEFAULT_OPS_DYNAMIC_QUANTIZATION = list(IntegerOpsRegistry.keys())
# Selected by `default_quantization_parameters` for static quantization with the QDQ format.
ORT_DEFAULT_OPS_STATIC_QUANTIZATION_QDQ = list(QDQRegistry.keys())
# Selected by `default_quantization_parameters` for static quantization with the QLinearOps mode (non-QDQ format).
ORT_DEFAULT_OPS_STATIC_QUANTIZATION_QOPS = list(QLinearOpsRegistry.keys())
@dataclass
class CalibrationConfig:
    """
    CalibrationConfig is the configuration class handling all the ONNX Runtime parameters related to the calibration
    step of static quantization.

    Args:
        dataset_name (`str`):
            The name of the calibration dataset.
        dataset_config_name (`str`):
            The name of the calibration dataset configuration.
        dataset_split (`str`):
            Which split of the dataset is used to perform the calibration step.
        dataset_num_samples (`int`):
            The number of samples composing the calibration dataset.
        method (`CalibrationMethod`):
            The method chosen to calculate the activations quantization parameters using the calibration dataset.
        num_bins (`Optional[int]`, defaults to `None`):
            The number of bins to use when creating the histogram when performing the calibration step using the
            Percentile or Entropy method.
        num_quantized_bins (`Optional[int]`, defaults to `None`):
            The number of quantized bins to use when performing the calibration step using the Entropy method.
        percentile (`Optional[float]`, defaults to `None`):
            The percentile to use when computing the activations quantization ranges when performing the calibration
            step using the Percentile method.
        moving_average (`Optional[bool]`, defaults to `None`):
            Whether to compute the moving average of the minimum and maximum values when performing the calibration
            step using the MinMax method.
        averaging_constant (`Optional[float]`, defaults to `None`):
            The constant smoothing factor to use when computing the moving average of the minimum and maximum values.
            Effective only when the MinMax calibration method is selected and `moving_average` is set to True.
    """

    dataset_name: str
    dataset_config_name: str
    dataset_split: str
    dataset_num_samples: int
    method: CalibrationMethod
    num_bins: Optional[int] = None
    num_quantized_bins: Optional[int] = None
    percentile: Optional[float] = None
    moving_average: Optional[bool] = None
    averaging_constant: Optional[float] = None

    def create_calibrator(
        self,
        onnx_model_path: Union[str, os.PathLike, Path],
        operators_to_quantize: Optional[List[str]],
        use_external_data_format: bool = False,
        force_symmetric_range: bool = False,
        augmented_model_name: str = "augmented_model.onnx",
    ) -> CalibraterBase:
        """
        Instantiates the ONNX Runtime calibrator matching this configuration.

        Args:
            onnx_model_path (`Union[str, os.PathLike, Path]`):
                Path of the ONNX model to calibrate, forwarded as `model`.
            operators_to_quantize (`Optional[List[str]]`):
                Operator types to calibrate, forwarded as `op_types_to_calibrate`. `None` is replaced by an empty
                list.
            use_external_data_format (`bool`, defaults to `False`):
                Forwarded as `use_external_data_format`. Only passed when onnxruntime > 1.10.0.
            force_symmetric_range (`bool`, defaults to `False`):
                Forwarded as the `symmetric` extra option. Only passed when onnxruntime > 1.10.0.
            augmented_model_name (`str`, defaults to `"augmented_model.onnx"`):
                Forwarded as `augmented_model_path`.

        Returns:
            The calibrator returned by `onnxruntime.quantization.calibrate.create_calibrator`.
        """
        # Inside a method body, `create_calibrator` resolves to the module-level ONNX Runtime import,
        # not to this method.
        calibrator_kwargs = {
            "model": onnx_model_path,
            "op_types_to_calibrate": operators_to_quantize or [],
            "calibrate_method": self.method,
            "augmented_model_path": augmented_model_name,
        }

        if parse(ort_version) > Version("1.10.0"):
            calibrator_kwargs["use_external_data_format"] = use_external_data_format
            candidate_options = {
                "symmetric": force_symmetric_range,
                "num_bins": self.num_bins,
                "num_quantized_bins": self.num_quantized_bins,
                "percentile": self.percentile,
                "moving_average": self.moving_average,
                "averaging_constant": self.averaging_constant,
            }
            # Only forward the options that were actually set: ONNX Runtime applies its own default for a key that
            # is absent, whereas an explicit `None` would replace that default.
            calibrator_kwargs["extra_options"] = {
                name: value for name, value in candidate_options.items() if value is not None
            }

        return create_calibrator(**calibrator_kwargs)
class AutoCalibrationConfig:
    """Factory methods creating a `CalibrationConfig` from a dataset, one per calibration method."""

    @staticmethod
    def _dataset_fields(dataset: Dataset) -> Dict[str, Any]:
        # Dataset-derived fields shared by every `CalibrationConfig` built by this class.
        return {
            "dataset_name": dataset.info.builder_name,
            "dataset_config_name": dataset.info.config_name,
            "dataset_split": str(dataset.split),
            "dataset_num_samples": dataset.num_rows,
        }

    @staticmethod
    def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: float = 0.01) -> CalibrationConfig:
        """
        Builds a calibration configuration using the MinMax method.

        Args:
            dataset (`Dataset`):
                The dataset to use when performing the calibration step.
            moving_average (`bool`):
                Whether to compute the moving average of the minimum and maximum values.
            averaging_constant (`float`):
                The constant smoothing factor to use when computing the moving average of the minimum and maximum
                values.

        Returns:
            The calibration configuration.

        Raises:
            NotImplementedError: If `moving_average` is set and onnxruntime < 1.11.0.
            ValueError: If `moving_average` is set and `averaging_constant` is outside [0, 1].
        """
        # Both checks only matter when the moving average is requested.
        if moving_average:
            if parse(ort_version) < Version("1.11.0"):
                raise NotImplementedError(
                    "MinMax calibration using the moving average method is only implemented for onnxruntime >= 1.11.0"
                )
            if not 0 <= averaging_constant <= 1:
                raise ValueError(f"Invalid averaging constant value ({averaging_constant}) should be within [0, 1]")

        shared_fields = AutoCalibrationConfig._dataset_fields(dataset)
        return CalibrationConfig(
            **shared_fields,
            method=CalibrationMethod.MinMax,
            moving_average=moving_average,
            averaging_constant=averaging_constant,
        )

    @staticmethod
    def entropy(
        dataset: Dataset,
        num_bins: int = 128,
        num_quantized_bins: int = 128,
    ) -> CalibrationConfig:
        """
        Builds a calibration configuration using the Entropy method.

        Args:
            dataset (`Dataset`):
                The dataset to use when performing the calibration step.
            num_bins (`int`):
                The number of bins to use when creating the histogram.
            num_quantized_bins (`int`):
                The number of quantized bins used to find the optimal threshold when computing the activations
                quantization ranges.

        Returns:
            The calibration configuration.

        Raises:
            NotImplementedError: If onnxruntime < 1.11.0.
            ValueError: If `num_bins` or `num_quantized_bins` is not strictly positive.
        """
        if parse(ort_version) < Version("1.11.0"):
            raise NotImplementedError("Entropy calibration method is only implemented for onnxruntime >= 1.11.0")

        if num_bins <= 0:
            raise ValueError(f"Invalid value num_bins ({num_bins}) should be >= 1")

        if num_quantized_bins <= 0:
            raise ValueError(f"Invalid value num_quantized_bins ({num_quantized_bins}) should be >= 1")

        shared_fields = AutoCalibrationConfig._dataset_fields(dataset)
        return CalibrationConfig(
            **shared_fields,
            method=CalibrationMethod.Entropy,
            num_bins=num_bins,
            num_quantized_bins=num_quantized_bins,
        )

    @staticmethod
    def percentiles(dataset: Dataset, num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig:
        """
        Builds a calibration configuration using the Percentile method.

        Args:
            dataset (`Dataset`):
                The dataset to use when performing the calibration step.
            num_bins (`int`):
                The number of bins to use when creating the histogram.
            percentile (`float`):
                The percentile to use when computing the activations quantization ranges.

        Returns:
            The calibration configuration.

        Raises:
            NotImplementedError: If onnxruntime < 1.11.0.
            ValueError: If `num_bins` is not strictly positive or `percentile` is outside [0, 100].
        """
        if parse(ort_version) < Version("1.11.0"):
            raise NotImplementedError("Percentile calibration method is only implemented for onnxruntime >= 1.11.0")

        if num_bins <= 0:
            raise ValueError(f"Invalid value num_bins ({num_bins}) should be >= 1")

        if not 0 <= percentile <= 100:
            raise ValueError(f"Invalid value percentile ({percentile}) should be within [0, 100]")

        shared_fields = AutoCalibrationConfig._dataset_fields(dataset)
        return CalibrationConfig(
            **shared_fields,
            method=CalibrationMethod.Percentile,
            num_bins=num_bins,
            percentile=percentile,
        )
@dataclass
class QuantizationConfig:
    """
    QuantizationConfig is the configuration class handling all the ONNX Runtime quantization parameters.

    Args:
        is_static (`bool`):
            Whether to apply static quantization or dynamic quantization.
        format (`QuantFormat`):
            Targeted ONNX Runtime quantization representation format.
            For the Operator Oriented (QOperator) format, all the quantized operators have their own ONNX definitions.
            For the Tensor Oriented (QDQ) format, the model is quantized by inserting QuantizeLinear / DeQuantizeLinear
            operators.
        mode (`QuantizationMode`, defaults to `QuantizationMode.QLinearOps`):
            Targeted ONNX Runtime quantization mode, default is QLinearOps to match QDQ format.
            When targeting dynamic quantization mode, the default value is `QuantizationMode.IntegerOps` whereas the
            default value for static quantization mode is `QuantizationMode.QLinearOps`.
        activations_dtype (`QuantType`, defaults to `QuantType.QUInt8`):
            The quantization data types to use for the activations.
        activations_symmetric (`bool`, defaults to `False`):
            Whether to apply symmetric quantization on the activations.
        weights_dtype (`QuantType`, defaults to `QuantType.QInt8`):
            The quantization data types to use for the weights.
        weights_symmetric (`bool`, defaults to `True`):
            Whether to apply symmetric quantization on the weights.
        per_channel (`bool`, defaults to `False`):
            Whether we should quantize per-channel (also known as "per-row"). Enabling this can increase overall
            accuracy while making the quantized model heavier.
        reduce_range (`bool`, defaults to `False`):
            Whether to use reduce-range 7-bits integers instead of 8-bits integers.
        nodes_to_quantize (`List[str]`, defaults to `[]`):
            List of the nodes names to quantize. If empty, all nodes being operators from `operators_to_quantize`
            will be quantized.
        nodes_to_exclude (`List[str]`, defaults to `[]`):
            List of the nodes names to exclude when applying quantization. The list of nodes in a model can be found
            loading the ONNX model through onnx.load, or through visual inspection with
            [netron](https://github.com/lutzroeder/netron).
        operators_to_quantize (`List[str]`):
            List of the operators types to quantize. Defaults to all quantizable operators for the given quantization
            mode and format. Quantizable operators can be found at
            https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.
        qdq_add_pair_to_weight (`bool`, defaults to `False`):
            By default, floating-point weights are quantized and fed to a solely inserted DeQuantizeLinear node.
            If set to True, the floating-point weights will remain and both QuantizeLinear / DeQuantizeLinear nodes
            will be inserted.
        qdq_dedicated_pair (`bool`, defaults to `False`):
            When inserting QDQ pair, multiple nodes can share a single QDQ pair as their inputs. If True, it will
            create an identical and dedicated QDQ pair for each node.
        qdq_op_type_per_channel_support_to_axis (`Dict[str, int]`):
            Set the channel axis for a specific operator type. Effective only when per channel quantization is
            supported and `per_channel` is set to True.
    """

    is_static: bool
    format: QuantFormat
    mode: QuantizationMode = QuantizationMode.QLinearOps
    activations_dtype: QuantType = QuantType.QUInt8
    activations_symmetric: bool = False
    weights_dtype: QuantType = QuantType.QInt8
    weights_symmetric: bool = True
    per_channel: bool = False
    reduce_range: bool = False
    nodes_to_quantize: List[str] = field(default_factory=list)
    nodes_to_exclude: List[str] = field(default_factory=list)
    operators_to_quantize: List[str] = field(default_factory=list)
    qdq_add_pair_to_weight: bool = False
    qdq_dedicated_pair: bool = False
    # Each instance gets its own copy so that mutating one configuration never alters the module-level default.
    qdq_op_type_per_channel_support_to_axis: Dict[str, int] = field(
        default_factory=lambda: dict(ORT_DEFAULT_CHANNEL_FOR_OPERATORS)
    )

    def __post_init__(self):
        """
        Normalizes string-valued enum fields, validates the configuration and fills in the default operators.

        Raises:
            KeyError: If a string value does not name a member of the corresponding enum.
            ValueError: If the mode or data types are not compatible with the requested kind of quantization.
        """
        # Convert enum names to enum members first: the validation and default selection below compare against
        # enum members, so a raw string would silently bypass them.
        if isinstance(self.format, str):
            self.format = QuantFormat[self.format]
        if isinstance(self.mode, str):
            self.mode = QuantizationMode[self.mode]
        if isinstance(self.activations_dtype, str):
            self.activations_dtype = QuantType[self.activations_dtype]
        if isinstance(self.weights_dtype, str):
            self.weights_dtype = QuantType[self.weights_dtype]

        ensure_valid_mode_or_raise(self.is_static, self.mode)
        ensure_valid_data_type_or_raise(self.is_static, self.activations_dtype, self.weights_dtype)

        # If needed, dynamically set operators_to_quantize default.
        if not self.operators_to_quantize:
            _, _, default_operators = default_quantization_parameters(
                self.is_static, self.format, self.mode, self.operators_to_quantize
            )
            # Copy so that this instance never aliases a module-level default list.
            self.operators_to_quantize = list(default_operators or [])

    @staticmethod
    def quantization_type_str(activations_dtype: QuantType, weights_dtype: QuantType) -> str:
        """Returns the `<activations>/<weights>` schema string, using `s8` for `QuantType.QInt8` and `u8` otherwise."""
        return (
            f"{'s8' if activations_dtype == QuantType.QInt8 else 'u8'}"
            f"/"
            f"{'s8' if weights_dtype == QuantType.QInt8 else 'u8'}"
        )

    @property
    def use_symmetric_calibration(self) -> bool:
        """Whether both the activations and the weights are quantized symmetrically."""
        return self.activations_symmetric and self.weights_symmetric

    def __str__(self):
        return (
            f"{self.format} ("
            f"mode: {self.mode}, "
            f"schema: {QuantizationConfig.quantization_type_str(self.activations_dtype, self.weights_dtype)}, "
            f"channel-wise: {self.per_channel})"
        )
def ensure_valid_mode_or_raise(use_static_quantization: bool, mode: QuantizationMode):
    """
    Checks that the quantization mode is compatible with the kind of quantization requested.

    Args:
        use_static_quantization (`bool`):
            Whether static quantization is targeted.
        mode (`QuantizationMode`):
            The quantization mode to validate.

    Raises:
        ValueError: If dynamic quantization is requested together with `QuantizationMode.QLinearOps`.
    """
    # Static quantization accepts any mode here.
    if use_static_quantization:
        return

    # Dynamic quantization: only QLinearOps is rejected.
    if not mode == QuantizationMode.QLinearOps:
        return

    raise ValueError(
        "Invalid combination of use_static_quantization = False and mode = QuantizationMode.QLinearOps. "
        "OnnxRuntime dynamic quantization requires mode = QuantizationMode.IntegerOps"
    )
def ensure_valid_data_type_or_raise(
    use_static_quantization: bool, activations_dtype: QuantType, weights_dtype: QuantType
):
    """
    Checks that the activations / weights data types are compatible with the kind of quantization requested.

    Args:
        use_static_quantization (`bool`):
            Whether static quantization is targeted.
        activations_dtype (`QuantType`):
            The quantization data type of the activations.
        weights_dtype (`QuantType`):
            The quantization data type of the weights.

    Raises:
        ValueError: If dynamic quantization is requested with `QuantType.QInt8` activations, or if static
            quantization is requested with `QuantType.QInt8` activations and `QuantType.QUInt8` weights.
    """
    if not use_static_quantization and activations_dtype == QuantType.QInt8:
        raise ValueError(
            "Invalid combination of "
            "use_static_quantization = False "
            "and "
            "activations_dtype = QuantType.QInt8. "
            "OnnxRuntime dynamic quantization requires activations_dtype = QuantType.QUInt8"
        )

    if use_static_quantization and activations_dtype == QuantType.QInt8 and weights_dtype == QuantType.QUInt8:
        # Note the trailing space after the first sentence: without it the two sentences ran together.
        raise ValueError(
            "Invalid combination of "
            "use_static_quantization = True, "
            "activations_dtype = QuantType.QInt8 "
            "and "
            "weights_dtype = QuantType.QUInt8. "
            "OnnxRuntime static quantization does not support "
            "activations_dtype = QuantType.QInt8 with weights_dtype = QuantType.QUInt8."
        )
def default_quantization_parameters(
    is_static: bool,
    format: Optional[QuantFormat] = None,
    mode: Optional[QuantizationMode] = None,
    operators_to_quantize: Optional[List[str]] = None,
) -> Tuple[QuantFormat, QuantizationMode, List[str]]:
    """
    Fills in the quantization parameters left unspecified with defaults matching the kind of quantization.

    Args:
        is_static (`bool`):
            Whether static quantization is targeted.
        format (`Optional[QuantFormat]`, defaults to `None`):
            Quantization format. When `None`, `QuantFormat.QDQ` is used for static quantization and
            `QuantFormat.QOperator` for dynamic quantization.
        mode (`Optional[QuantizationMode]`, defaults to `None`):
            Quantization mode. When `None`, `QuantizationMode.QLinearOps` is used for static quantization and
            `QuantizationMode.IntegerOps` for dynamic quantization.
        operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
            Operator types to quantize. When `None` or empty, the default list matching `is_static`, `format` and
            `mode` is used.

    Returns:
        The `(format, mode, operators_to_quantize)` tuple. If `operators_to_quantize` was `None` or empty and no
        default matches the given combination, it is returned unchanged.
    """
    if format is None:
        format = QuantFormat.QDQ if is_static else QuantFormat.QOperator

    if mode is None:
        mode = QuantizationMode.QLinearOps if is_static else QuantizationMode.IntegerOps

    if not operators_to_quantize:
        # Hand out copies so that a caller mutating the returned list cannot alter the module-level defaults.
        if is_static and format == QuantFormat.QDQ:
            operators_to_quantize = list(ORT_DEFAULT_OPS_STATIC_QUANTIZATION_QDQ)
        elif is_static and mode == QuantizationMode.QLinearOps:
            operators_to_quantize = list(ORT_DEFAULT_OPS_STATIC_QUANTIZATION_QOPS)
        elif not is_static and mode == QuantizationMode.IntegerOps:
            operators_to_quantize = list(ORT_DEFAULT_OPS_DYNAMIC_QUANTIZATION)

    return format, mode, operators_to_quantize
class AutoQuantizationConfig:
    """Factory methods creating a [`~onnxruntime.QuantizationConfig`] preset for a given hardware target."""

    @staticmethod
    def _build_config(
        *,
        is_static: bool,
        activations_dtype: QuantType,
        activations_symmetric: bool,
        weights_dtype: QuantType,
        weights_symmetric: bool,
        per_channel: bool,
        reduce_range: bool,
        nodes_to_quantize: Optional[List[str]],
        nodes_to_exclude: Optional[List[str]],
        operators_to_quantize: Optional[List[str]],
        **qdq_options: bool,
    ) -> QuantizationConfig:
        # Shared by every preset: resolve the default format / mode / operators, then build the configuration.
        quant_format, quant_mode, operators_to_quantize = default_quantization_parameters(
            is_static, operators_to_quantize=operators_to_quantize
        )

        return QuantizationConfig(
            is_static=is_static,
            format=quant_format,
            mode=quant_mode,
            activations_dtype=activations_dtype,
            activations_symmetric=activations_symmetric,
            weights_dtype=weights_dtype,
            weights_symmetric=weights_symmetric,
            per_channel=per_channel,
            reduce_range=reduce_range,
            nodes_to_quantize=nodes_to_quantize or [],
            nodes_to_exclude=nodes_to_exclude or [],
            operators_to_quantize=operators_to_quantize,
            **qdq_options,
        )

    @staticmethod
    def arm64(
        is_static: bool,
        use_symmetric_activations: bool = False,
        use_symmetric_weights: bool = True,
        per_channel: bool = True,
        nodes_to_quantize: Optional[List[str]] = None,
        nodes_to_exclude: Optional[List[str]] = None,
        operators_to_quantize: Optional[List[str]] = None,
    ) -> QuantizationConfig:
        """
        Creates a [`~onnxruntime.QuantizationConfig`] fit for ARM64.

        Args:
            is_static (`bool`):
                Boolean flag to indicate whether we target static or dynamic quantization.
            use_symmetric_activations (`bool`, defaults to `False`):
                Whether to use symmetric quantization for activations.
            use_symmetric_weights (`bool`, defaults to `True`):
                Whether to use symmetric quantization for weights.
            per_channel (`bool`, defaults to `True`):
                Whether we should quantize per-channel (also known as "per-row"). Enabling this can
                increase overall accuracy while making the quantized model heavier.
            nodes_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to quantize. If `None`, all nodes being operators from `operators_to_quantize` will
                be quantized.
            nodes_to_exclude (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to exclude from quantization. The list of nodes in a model can be found loading the
                ONNX model through onnx.load, or through visual inspection with
                [netron](https://github.com/lutzroeder/netron).
            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Type of nodes to perform quantization on. By default, all the quantizable operators will be
                quantized. Quantizable operators can be found at
                https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.

        Returns:
            The quantization configuration (u8 activations, s8 weights, full range).
        """
        # u8/s8 is faster (than u8/u8) on lower-end ARM64 and identical on higher-end ARM64,
        # so let's use u8/s8 by default
        return AutoQuantizationConfig._build_config(
            is_static=is_static,
            activations_dtype=QuantType.QUInt8,
            activations_symmetric=use_symmetric_activations,
            weights_dtype=QuantType.QInt8,
            weights_symmetric=use_symmetric_weights,
            per_channel=per_channel,
            reduce_range=False,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            operators_to_quantize=operators_to_quantize,
        )

    @staticmethod
    def avx2(
        is_static: bool,
        use_symmetric_activations: bool = False,
        use_symmetric_weights: bool = True,
        per_channel: bool = True,
        reduce_range: bool = False,
        nodes_to_quantize: Optional[List[str]] = None,
        nodes_to_exclude: Optional[List[str]] = None,
        operators_to_quantize: Optional[List[str]] = None,
    ) -> QuantizationConfig:
        """
        Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX2 instruction set.

        Args:
            is_static (`bool`):
                Boolean flag to indicate whether we target static or dynamic quantization.
            use_symmetric_activations (`bool`, defaults to `False`):
                Whether to use symmetric quantization for activations.
            use_symmetric_weights (`bool`, defaults to `True`):
                Whether to use symmetric quantization for weights.
            per_channel (`bool`, defaults to `True`):
                Whether we should quantize per-channel (also known as "per-row"). Enabling this can
                increase overall accuracy while making the quantized model heavier.
            reduce_range (`bool`, defaults to `False`):
                Indicate whether to use 8-bits integers (False) or reduce-range 7-bits integers (True).
                As a baseline, it is always recommended testing with full range (reduce_range = False) and then, if
                accuracy drop is significant, to try with reduced range (reduce_range = True).
                Intel's CPUs using AVX512 (non VNNI) can suffer from saturation issue when invoking
                the VPMADDUBSW instruction. To counter this, one should use 7-bits rather than 8-bits integers.
            nodes_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to quantize. If `None`, all nodes being operators from `operators_to_quantize` will
                be quantized.
            nodes_to_exclude (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to exclude from quantization. The list of nodes in a model can be found loading the
                ONNX model through onnx.load, or through visual inspection with
                [netron](https://github.com/lutzroeder/netron).
            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Type of nodes to perform quantization on. By default, all the quantizable operators will be
                quantized. Quantizable operators can be found at
                https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.

        Returns:
            The quantization configuration (u8 activations, u8 weights).
        """
        return AutoQuantizationConfig._build_config(
            is_static=is_static,
            activations_dtype=QuantType.QUInt8,
            activations_symmetric=use_symmetric_activations,
            weights_dtype=QuantType.QUInt8,
            weights_symmetric=use_symmetric_weights,
            per_channel=per_channel,
            reduce_range=reduce_range,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            operators_to_quantize=operators_to_quantize,
        )

    @staticmethod
    def avx512(
        is_static: bool,
        use_symmetric_activations: bool = False,
        use_symmetric_weights: bool = True,
        per_channel: bool = True,
        reduce_range: bool = False,
        nodes_to_quantize: Optional[List[str]] = None,
        nodes_to_exclude: Optional[List[str]] = None,
        operators_to_quantize: Optional[List[str]] = None,
    ) -> QuantizationConfig:
        """
        Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX512 instruction set.

        Args:
            is_static (`bool`):
                Boolean flag to indicate whether we target static or dynamic quantization.
            use_symmetric_activations (`bool`, defaults to `False`):
                Whether to use symmetric quantization for activations.
            use_symmetric_weights (`bool`, defaults to `True`):
                Whether to use symmetric quantization for weights.
            per_channel (`bool`, defaults to `True`):
                Whether we should quantize per-channel (also known as "per-row"). Enabling this can
                increase overall accuracy while making the quantized model heavier.
            reduce_range (`bool`, defaults to `False`):
                Indicate whether to use 8-bits integers (False) or reduce-range 7-bits integers (True).
                As a baseline, it is always recommended testing with full range (reduce_range = False) and then, if
                accuracy drop is significant, to try with reduced range (reduce_range = True).
                Intel's CPUs using AVX512 (non VNNI) can suffer from saturation issue when invoking
                the VPMADDUBSW instruction. To counter this, one should use 7-bits rather than 8-bits integers.
            nodes_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to quantize. If `None`, all nodes being operators from `operators_to_quantize` will
                be quantized.
            nodes_to_exclude (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to exclude from quantization. The list of nodes in a model can be found loading the
                ONNX model through onnx.load, or through visual inspection with
                [netron](https://github.com/lutzroeder/netron).
            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Type of nodes to perform quantization on. By default, all the quantizable operators will be
                quantized. Quantizable operators can be found at
                https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.

        Returns:
            The quantization configuration (u8 activations, s8 weights).
        """
        return AutoQuantizationConfig._build_config(
            is_static=is_static,
            activations_dtype=QuantType.QUInt8,
            activations_symmetric=use_symmetric_activations,
            weights_dtype=QuantType.QInt8,
            weights_symmetric=use_symmetric_weights,
            per_channel=per_channel,
            reduce_range=reduce_range,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            operators_to_quantize=operators_to_quantize,
        )

    @staticmethod
    def avx512_vnni(
        is_static: bool,
        use_symmetric_activations: bool = False,
        use_symmetric_weights: bool = True,
        per_channel: bool = True,
        nodes_to_quantize: Optional[List[str]] = None,
        nodes_to_exclude: Optional[List[str]] = None,
        operators_to_quantize: Optional[List[str]] = None,
    ) -> QuantizationConfig:
        """
        Creates a [`~onnxruntime.QuantizationConfig`] fit for CPU with AVX512-VNNI instruction set.

        When targeting Intel AVX512-VNNI CPU underlying execution engine leverage the CPU instruction VPDPBUSD to
        compute \\i32 += i8(w) * u8(x)\\ within a single instruction.

        AVX512-VNNI (AVX512 Vector Neural Network Instruction)
        is an x86 extension Instruction set and is a part of the AVX-512 ISA.

        AVX512 VNNI is designed to accelerate convolutional neural network for INT8 inference.

        Args:
            is_static (`bool`):
                Boolean flag to indicate whether we target static or dynamic quantization.
            use_symmetric_activations (`bool`, defaults to `False`):
                Whether to use symmetric quantization for activations.
            use_symmetric_weights (`bool`, defaults to `True`):
                Whether to use symmetric quantization for weights.
            per_channel (`bool`, defaults to `True`):
                Whether we should quantize per-channel (also known as "per-row"). Enabling this can
                increase overall accuracy while making the quantized model heavier.
            nodes_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to quantize. If `None`, all nodes being operators from `operators_to_quantize` will
                be quantized.
            nodes_to_exclude (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to exclude from quantization. The list of nodes in a model can be found loading the
                ONNX model through onnx.load, or through visual inspection with
                [netron](https://github.com/lutzroeder/netron).
            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Type of nodes to perform quantization on. By default, all the quantizable operators will be
                quantized. Quantizable operators can be found at
                https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.

        Returns:
            The quantization configuration (u8 activations, s8 weights, full range).
        """
        return AutoQuantizationConfig._build_config(
            is_static=is_static,
            activations_dtype=QuantType.QUInt8,
            activations_symmetric=use_symmetric_activations,
            weights_dtype=QuantType.QInt8,
            weights_symmetric=use_symmetric_weights,
            per_channel=per_channel,
            reduce_range=False,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            operators_to_quantize=operators_to_quantize,
        )

    @staticmethod
    def tensorrt(
        per_channel: bool = True,
        nodes_to_quantize: Optional[List[str]] = None,
        nodes_to_exclude: Optional[List[str]] = None,
        operators_to_quantize: Optional[List[str]] = None,
    ) -> QuantizationConfig:
        """
        Creates a [`~onnxruntime.QuantizationConfig`] fit for TensorRT static quantization, targeting NVIDIA GPUs.

        Args:
            per_channel (`bool`, defaults to `True`):
                Whether we should quantize per-channel (also known as "per-row"). Enabling this can
                increase overall accuracy while making the quantized model heavier.
            nodes_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to quantize. If `None`, all nodes being operators from `operators_to_quantize` will
                be quantized.
            nodes_to_exclude (`Optional[List[str]]`, defaults to `None`):
                Specific nodes to exclude from quantization. The list of nodes in a model can be found loading the
                ONNX model through onnx.load, or through visual inspection with
                [netron](https://github.com/lutzroeder/netron).
            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
                Type of nodes to perform quantization on. By default, all the quantizable operators will be
                quantized. Quantizable operators can be found at
                https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.

        Returns:
            The static quantization configuration (symmetric s8 activations and weights, full range, with
            `qdq_add_pair_to_weight` and `qdq_dedicated_pair` enabled).
        """
        return AutoQuantizationConfig._build_config(
            is_static=True,
            activations_dtype=QuantType.QInt8,
            activations_symmetric=True,  # TRT only supports symmetric
            weights_dtype=QuantType.QInt8,
            weights_symmetric=True,  # TRT only supports symmetric
            per_channel=per_channel,
            reduce_range=False,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            operators_to_quantize=operators_to_quantize,
            # `qdq_add_pair_to_weight=True` is required because TensorRT expects QDQ pairs on weights, not only
            # DequantizeLinear.
            qdq_add_pair_to_weight=True,
            # `qdq_dedicated_pair=True` is required by TensorRT, since it expects a single node after each
            # `QuantizeLinear` + `DequantizeLinear` (QDQ) pair.
            qdq_dedicated_pair=True,
        )
@dataclass
class OptimizationConfig:
    """
    OptimizationConfig is the configuration class handling all the ONNX Runtime optimization parameters.
    There are two stacks of optimizations:
        1. The ONNX Runtime general-purpose optimization tool: it can work on any ONNX model.
        2. The ONNX Runtime transformers optimization tool: it can only work on a subset of transformers models.

    Attributes:
        optimization_level (`int`, defaults to 1):
            Optimization level performed by ONNX Runtime of the loaded graph.
            Supported optimization level are 0, 1, 2 and 99.
                - 0: will disable all optimizations
                - 1: will enable basic optimizations
                - 2: will enable basic and extended optimizations, including complex node fusions applied to the nodes
                assigned to the CPU or CUDA execution provider, making the resulting optimized graph hardware dependent
                - 99: will enable all available optimizations including layout optimizations
        optimize_for_gpu (`bool`, defaults to `False`):
            Whether to optimize the model for GPU inference.
            The optimized graph might contain operators for GPU or CPU only when `optimization_level` > 1.
        fp16 (`bool`, defaults to `False`):
            Whether all weights and nodes should be converted from float32 to float16.
        enable_transformers_specific_optimizations (`bool`, defaults to `True`):
            Whether to use the `transformers` specific optimizations on top of ONNX Runtime general optimizations.
        disable_gelu_fusion (`bool`, defaults to `False`):
            Whether to disable the Gelu fusion.
        disable_layer_norm_fusion (`bool`, defaults to `False`):
            Whether to disable Layer Normalization fusion.
        disable_attention_fusion (`bool`, defaults to `False`):
            Whether to disable Attention fusion.
        disable_skip_layer_norm_fusion (`bool`, defaults to `False`):
            Whether to disable SkipLayerNormalization fusion.
        disable_bias_skip_layer_norm_fusion (`bool`, defaults to `False`):
            Whether to disable Add Bias and SkipLayerNormalization fusion.
        disable_bias_gelu_fusion (`bool`, defaults to `False`):
            Whether to disable Add Bias and Gelu / FastGelu fusion.
        disable_embed_layer_norm_fusion (`bool`, defaults to `True`):
            Whether to disable EmbedLayerNormalization fusion.
            The default value is set to `True` since this fusion is incompatible with ONNX Runtime quantization.
        enable_gelu_approximation (`bool`, defaults to `False`):
            Whether to enable Gelu / BiasGelu to FastGelu conversion.
            The default value is set to `False` since this approximation might slightly impact the model's accuracy.
        use_mask_index (`bool`, defaults to `False`):
            Whether to use mask index instead of raw attention mask in the attention operator.
        no_attention_mask (`bool`, defaults to `False`):
            Whether to not use attention masks. Only works for bert model type.
        disable_shape_inference (`bool`, defaults to `False`):
            Whether to disable symbolic shape inference.
            The default value is set to `False` but symbolic shape inference might cause issues sometimes.
        use_multi_head_attention (`bool`, defaults to `False`):
            Experimental argument. Use MultiHeadAttention instead of Attention operator, which has merged weights
            for Q/K/V projection, which might be faster in some cases since 3 MatMul is merged into one.
            Note that MultiHeadAttention might be slower than Attention when qkv are not packed.
        enable_gemm_fast_gelu_fusion (`bool`, defaults to `False`):
            Enable GemmfastGelu fusion.
        use_raw_attention_mask (`bool`, defaults to `False`):
            Use raw attention mask. Use this option if your input is not right-side padding. This might deactivate
            fused attention and get worse performance.
        disable_group_norm_fusion (`bool`, defaults to `True`):
            Do not fuse GroupNorm. Only works for model_type=unet.
        disable_packed_kv (`bool`, defaults to `True`):
            Do not use packed kv in cross attention. Only works for model_type=unet.
        disable_rotary_embeddings (`bool`, defaults to `False`):
            Whether to disable Rotary Embedding fusion.

    Deprecated attributes (all default to `None`; when set, the value is copied to the replacement attribute in
    `__post_init__` and a `FutureWarning` is emitted):
        optimize_with_onnxruntime_only -> enable_transformers_specific_optimizations (value is negated)
        disable_gelu -> disable_gelu_fusion
        disable_layer_norm -> disable_layer_norm_fusion
        disable_attention -> disable_attention_fusion
        disable_skip_layer_norm -> disable_skip_layer_norm_fusion
        disable_bias_skip_layer_norm -> disable_bias_skip_layer_norm_fusion
        disable_bias_gelu -> disable_bias_gelu_fusion
        disable_embed_layer_norm -> disable_embed_layer_norm_fusion
    """

    optimization_level: int = 1
    optimize_for_gpu: bool = False

    fp16: bool = False

    optimize_with_onnxruntime_only: Optional[bool] = None
    enable_transformers_specific_optimizations: bool = True

    disable_gelu: Optional[bool] = None
    disable_gelu_fusion: bool = False

    disable_layer_norm: Optional[bool] = None
    disable_layer_norm_fusion: bool = False

    disable_attention: Optional[bool] = None
    disable_attention_fusion: bool = False

    disable_skip_layer_norm: Optional[bool] = None
    disable_skip_layer_norm_fusion: bool = False

    disable_bias_skip_layer_norm: Optional[bool] = None
    disable_bias_skip_layer_norm_fusion: bool = False

    disable_bias_gelu: Optional[bool] = None
    disable_bias_gelu_fusion: bool = False

    # Deprecated alias: it must default to `None` (and be declared only once), otherwise `__post_init__` would
    # consider it as explicitly set and always override `disable_embed_layer_norm_fusion`.
    disable_embed_layer_norm: Optional[bool] = None
    disable_embed_layer_norm_fusion: bool = True

    enable_gelu_approximation: bool = False
    use_mask_index: bool = False
    no_attention_mask: bool = False
    disable_shape_inference: bool = False

    # ONNX Runtime 1.14.0 arguments
    use_multi_head_attention: bool = False
    enable_gemm_fast_gelu_fusion: bool = False
    use_raw_attention_mask: bool = False
    disable_group_norm_fusion: bool = True
    disable_packed_kv: bool = True

    # ONNX Runtime 1.16.2 arguments
    disable_rotary_embeddings: bool = False

    def __post_init__(self):
        """
        Forwards every deprecated attribute that was explicitly set (i.e. is not `None`) to its replacement,
        emitting a `FutureWarning` for each of them.
        """

        def deprecate_renamed_attribute(old_name, new_name, mapping_func=None):
            old_value = getattr(self, old_name, None)
            if old_value is None:
                # The deprecated attribute was not provided: keep the value of the new attribute.
                return
            new_value = old_value if mapping_func is None else mapping_func(old_value)
            setattr(self, new_name, new_value)
            warnings.warn(
                f"{old_name} will be deprecated soon, use {new_name} instead, {new_name} is set to "
                f"{getattr(self, new_name)}.",
                FutureWarning,
            )

        # The old flag has the opposite meaning of the new one, hence the negation.
        deprecate_renamed_attribute(
            "optimize_with_onnxruntime_only",
            "enable_transformers_specific_optimizations",
            mapping_func=lambda x: not x,
        )
        deprecate_renamed_attribute("disable_gelu", "disable_gelu_fusion")
        deprecate_renamed_attribute("disable_layer_norm", "disable_layer_norm_fusion")
        deprecate_renamed_attribute("disable_attention", "disable_attention_fusion")
        deprecate_renamed_attribute("disable_skip_layer_norm", "disable_skip_layer_norm_fusion")
        deprecate_renamed_attribute("disable_bias_skip_layer_norm", "disable_bias_skip_layer_norm_fusion")
        deprecate_renamed_attribute("disable_bias_gelu", "disable_bias_gelu_fusion")
        deprecate_renamed_attribute("disable_embed_layer_norm", "disable_embed_layer_norm_fusion")

    def create_fusion_options(self, model_type: str) -> FusionOptions:
        """
        Builds the fusion options matching this configuration.

        An attribute container is filled with `model_type` and the attributes of this configuration, renamed to
        the names listed in `attribute_map`, and is then handed to `FusionOptions.parse`.

        Args:
            model_type (`str`):
                The model type, stored as `model_type` on the object given to `FusionOptions.parse`.

        Returns:
            `FusionOptions`: The object returned by `FusionOptions.parse`.
        """

        class Box:
            pass

        args = Box()
        args.model_type = model_type

        # Maps the attribute names of this class to the names set on the object given to `FusionOptions.parse`.
        attribute_map = {
            "disable_gelu_fusion": "disable_gelu",
            "disable_layer_norm_fusion": "disable_layer_norm",
            "disable_attention_fusion": "disable_attention",
            "disable_skip_layer_norm_fusion": "disable_skip_layer_norm",
            "disable_bias_skip_layer_norm_fusion": "disable_bias_skip_layer_norm",
            "disable_bias_gelu_fusion": "disable_bias_gelu",
            "disable_embed_layer_norm_fusion": "disable_embed_layer_norm",
            "disable_group_norm_fusion": "disable_group_norm",
            "disable_packed_kv": "disable_packed_kv",
            "use_raw_attention_mask": "use_raw_attention_mask",
            "enable_gemm_fast_gelu_fusion": "enable_gemm_fast_gelu",
            "use_multi_head_attention": "use_multi_head_attention",
            "disable_rotary_embeddings": "disable_rotary_embeddings",
        }
        for attr_name, fusion_attr_name in attribute_map.items():
            setattr(args, fusion_attr_name, getattr(self, attr_name))

        # Copy the remaining attributes as they are. Names already set above are skipped, so the deprecated
        # aliases (e.g. `disable_gelu`) never overwrite the values coming from the `*_fusion` attributes.
        for attr, value in self.__dict__.items():
            if hasattr(args, attr):
                continue
            setattr(args, attr, value)

        return FusionOptions.parse(args)
class AutoOptimizationConfig:
    """
    Factory building the most common [`~OptimizationConfig`] presets, named `O1` to `O4`.
    """

    # Keyword arguments given to `OptimizationConfig` for each preset.
    _LEVELS = {
        "O1": {
            "optimization_level": 1,
            "enable_transformers_specific_optimizations": False,
        },
        "O2": {
            "optimization_level": 2,
            "enable_transformers_specific_optimizations": True,
        },
        "O3": {
            "optimization_level": 2,
            "enable_transformers_specific_optimizations": True,
            "enable_gelu_approximation": True,
        },
        "O4": {
            "optimization_level": 2,
            "enable_transformers_specific_optimizations": True,
            "enable_gelu_approximation": True,
            "fp16": True,
        },
    }

    @classmethod
    def with_optimization_level(cls, optimization_level: str, for_gpu: bool = False, **kwargs) -> OptimizationConfig:
        """
        Builds the [`~OptimizationConfig`] associated with a named optimization level.

        Args:
            optimization_level (`str`):
                Name of the preset to use, one of:
                    - O1: Basic general optimizations
                    - O2: Basic and extended general optimizations, transformers-specific fusions.
                    - O3: Same as O2 with Fast Gelu approximation.
                    - O4: Same as O3 with mixed precision.
            for_gpu (`bool`, defaults to `False`):
                Whether the optimized model is meant to run on GPU, forwarded as `optimize_for_gpu`. It is always
                forced to `True` for O4, a warning being logged when `False` was given.
            kwargs (`Dict[str, Any]`):
                Additional arguments forwarded to the [`~OptimizationConfig`] constructor.

        Returns:
            `OptimizationConfig`: The configuration matching the requested optimization level.

        Raises:
            `ValueError`: If `optimization_level` is not one of the known presets.
        """
        presets = cls._LEVELS
        if optimization_level not in presets:
            allowed_levels = ", ".join(presets.keys())
            raise ValueError(f"optimization_level must be in {allowed_levels}, got {optimization_level}")

        if optimization_level == "O4":
            # O4 enables fp16, hence the GPU target is enforced whatever was requested.
            if for_gpu is False:
                logger.warning("Overridding for_gpu=False to for_gpu=True as half precision is available only on GPU.")
            for_gpu = True

        preset_arguments = presets[optimization_level]
        return OptimizationConfig(optimize_for_gpu=for_gpu, **preset_arguments, **kwargs)

    @classmethod
    def O1(cls, for_gpu: bool = False, **kwargs) -> OptimizationConfig:
        """
        Shortcut for `with_optimization_level("O1", ...)`.

        Args:
            for_gpu (`bool`, defaults to `False`):
                Whether the optimized model is meant to run on GPU.
            kwargs (`Dict[str, Any]`):
                Additional arguments forwarded to the [`~OptimizationConfig`] constructor.

        Returns:
            `OptimizationConfig`: The configuration of the O1 optimization level.
        """
        return cls.with_optimization_level(optimization_level="O1", for_gpu=for_gpu, **kwargs)

    @classmethod
    def O2(cls, for_gpu: bool = False, **kwargs) -> OptimizationConfig:
        """
        Shortcut for `with_optimization_level("O2", ...)`.

        Args:
            for_gpu (`bool`, defaults to `False`):
                Whether the optimized model is meant to run on GPU.
            kwargs (`Dict[str, Any]`):
                Additional arguments forwarded to the [`~OptimizationConfig`] constructor.

        Returns:
            `OptimizationConfig`: The configuration of the O2 optimization level.
        """
        return cls.with_optimization_level(optimization_level="O2", for_gpu=for_gpu, **kwargs)

    @classmethod
    def O3(cls, for_gpu: bool = False, **kwargs) -> OptimizationConfig:
        """
        Shortcut for `with_optimization_level("O3", ...)`.

        Args:
            for_gpu (`bool`, defaults to `False`):
                Whether the optimized model is meant to run on GPU.
            kwargs (`Dict[str, Any]`):
                Additional arguments forwarded to the [`~OptimizationConfig`] constructor.

        Returns:
            `OptimizationConfig`: The configuration of the O3 optimization level.
        """
        return cls.with_optimization_level(optimization_level="O3", for_gpu=for_gpu, **kwargs)

    @classmethod
    def O4(cls, for_gpu: bool = True, **kwargs) -> OptimizationConfig:
        """
        Shortcut for `with_optimization_level("O4", ...)`.

        Args:
            for_gpu (`bool`, defaults to `True`):
                Whether the optimized model is meant to run on GPU. O4 always ends up with `optimize_for_gpu=True`:
                passing `False` only results in a warning being logged.
            kwargs (`Dict[str, Any]`):
                Additional arguments forwarded to the [`~OptimizationConfig`] constructor.

        Returns:
            `OptimizationConfig`: The configuration of the O4 optimization level.
        """
        return cls.with_optimization_level(optimization_level="O4", for_gpu=for_gpu, **kwargs)
class ORTConfig(BaseConfig):
"""
ORTConfig is the configuration class handling all the ONNX Runtime parameters related to the ONNX IR model export,
optimization and quantization parameters.
Attributes:
opset (`Optional[int]`, defaults to `None`):
ONNX opset version to export the model with.
use_external_data_format (`bool`, defaults to `False`):
Allow exporting models whose size is greater than or equal to 2GB.
one_external_file (`bool`, defaults to `True`):
When `use_external_data_format=True`, whether to save all tensors to one external file.
If false, save each tensor to a file named with the tensor name.
(Can not be set to `False` for the quantization)
optimization (`Optional[OptimizationConfig]`, defaults to `None`):
Specify a configuration to optimize ONNX Runtime model
quantization (`Optional[QuantizationConfig]`, defaults to `None`):
Specify a configuration to quantize ONNX Runtime model
"""
CONFIG_NAME = "ort_config.json"
FULL_CONFIGURATION_FILE = "ort_config.json"
def __init__(
self,
opset: Optional[int] = None,
use_external_data_format: bool = False,
one_external_file: bool = True,
optimization: Optional[OptimizationConfig] = None,
quantization: Optional[QuantizationConfig] = None,
**kwargs,
):
super().__init__()
self.opset = opset
self.use_external_data_format = use_external_data_format
self.one_external_file = one_external_file
if isinstance(optimization, dict) and optimization:
self.optimization = OptimizationConfig(**optimization)
elif isinstance(optimization, OptimizationConfig):
self.optimization = optimization