Support ORT1.12 (#1203)
mengniwang95 committed Sep 5, 2022
1 parent ec84fd6 commit 498ac48
Showing 24 changed files with 653 additions and 27 deletions.
99 changes: 96 additions & 3 deletions neural_compressor/adaptor/onnxrt_qdq.yaml
@@ -87,7 +87,7 @@
int8: ['Conv', 'MatMul', 'Attention', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose']
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
@@ -98,7 +98,7 @@

-
version:
name: ['1.10.0', '1.11.0']
name: '1.10.0'

precisions:
<<: *common_precisions
@@ -107,7 +107,7 @@
int8: ['Conv', 'MatMul', 'Attention', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose']
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
@@ -174,6 +174,99 @@
graph_optimization:
<<: *default_optimization

-

version:
name: ['1.11.0', '1.12.0']

precisions:
<<: *common_precisions

ops:
int8: ['Conv', 'MatMul', 'Attention', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Squeeze', 'Reshape',
'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Gemm', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
int8: {
'Conv': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gather': {
'weight': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_channel', 'per_tensor'],
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor'],
}
},
'MatMul': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gemm': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'default': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
}
},
}


graph_optimization:
<<: *default_optimization

-

version:
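Reader note (not part of the diff): the new entry above pins a dedicated capability table to onnxruntime 1.11.0 and 1.12.0, whose int8 op list now also covers Gemm and Resize, while 1.10.0 keeps its own block. A minimal sketch of how such a version-gated table could be resolved at runtime follows; it assumes the yaml file is a top-level list of entries whose version.name is a string or a list of strings, and pick_capabilities is a hypothetical name, not the adaptor's actual API.

import yaml
import onnxruntime as ort

def pick_capabilities(yaml_path, ort_version=None):
    # Resolve the capability entry matching the installed onnxruntime version.
    ort_version = ort_version or ort.__version__
    with open(yaml_path) as f:
        entries = yaml.safe_load(f)
    for entry in entries:
        names = entry['version']['name']
        names = [names] if isinstance(names, str) else names
        if ort_version in names:
            return entry
    # No exact match: fall back to the entry gated on 'default'.
    return next(e for e in entries if e['version']['name'] == 'default')

With onnxruntime 1.12.0 installed, this would return the ['1.11.0', '1.12.0'] block added in this hunk.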
125 changes: 123 additions & 2 deletions neural_compressor/adaptor/onnxrt_qlinear.yaml
@@ -134,7 +134,7 @@
int8: ['Conv', 'MatMul', 'Attention', 'Mul', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Add', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose']
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
@@ -216,7 +216,7 @@
-

version:
name: ['1.9.0', '1.10.0', '1.11.0']
name: ['1.9.0', '1.10.0']

precisions:
<<: *common_precisions
@@ -314,6 +314,127 @@

graph_optimization:
<<: *default_optimization

-

version:
name: ['1.11.0', '1.12.0']

precisions:
<<: *common_precisions

ops:
int8: ['Conv', 'MatMul', 'Attention', 'Mul', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Add', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'ArgMax',
'Gemm', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
int8: {
'FusedConv': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Conv': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gather': {
'weight': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_channel', 'per_tensor'],
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor'],
}
},
'MatMul': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gemm': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'EmbedLayerNormalization': {
'weight': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
}
},
'default': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
}
},
}

graph_optimization:
<<: *default_optimization
-
version:
name: 'default'
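Reader note: the QLinearOps table for 1.11.0/1.12.0 likewise adds ArgMax, Gemm, and Resize to the int8 op list. Only Gemm gets a dedicated capability block here; ArgMax and Resize have none, so they would fall back to the 'default' entry (per-tensor, asymmetric uint8 activations). A toy sketch of that fallback, with hypothetical names and a trimmed-down capability dict:

def op_capability(int8_capabilities, op_type):
    # Ops without a dedicated block inherit the 'default' capability entry.
    return int8_capabilities.get(op_type, int8_capabilities['default'])

int8_caps = {
    'Gemm':    {'weight': {'dtype': ['int8', 'fp32']}, 'activation': {'dtype': ['uint8', 'fp32']}},
    'default': {'weight': {'dtype': ['int8', 'fp32']}, 'activation': {'dtype': ['uint8', 'fp32']}},
}
print(op_capability(int8_caps, 'Resize'))   # -> the 'default' entry
print(op_capability(int8_caps, 'Gemm'))     # -> the dedicated Gemm entry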
4 changes: 4 additions & 0 deletions neural_compressor/adaptor/ox_utils/onnxrt_mid.py
@@ -134,6 +134,10 @@ def augment_graph(self, activation_only=False, weight_only=False):
if node.op_type == 'EmbedLayerNormalization' and len(node.output) > 1 and \
node.output[1] in tensors_to_dump:
tensors_to_dump.remove(node.output[1])
elif node.op_type == 'Resize':
tensors_to_dump.remove(node.input[1])
if len(node.input) == 4:
tensors_to_dump.remove(node.input[2])
elif weight_only:
for input in node.input:
if self.already_quantized and \
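Reader note on the onnxrt_mid.py hunk: a Resize node's trailing inputs hold resize parameters rather than activations (ONNX layout X, roi, scales and optionally sizes), so the roi input, and the scales input for 4-input Resize, are removed from the list of tensors dumped during calibration. A small sketch of the same filtering, under the assumption that tensors_to_dump behaves like a set of tensor names:

def drop_resize_params(node, tensors_to_dump):
    # Keep only activation data for calibration; roi/scales carry no statistics.
    if node.op_type != 'Resize':
        return
    tensors_to_dump.discard(node.input[1])        # roi
    if len(node.input) == 4:
        tensors_to_dump.discard(node.input[2])    # scales (sizes supplied instead)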
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/activation.py
@@ -34,7 +34,8 @@ def convert(self):
if node.op_type in ['Relu', 'Clip']:
return

if len(self.quantizer.model.get_children(node)) == 0:
if len(self.quantizer.model.get_children(node)) == 0 or \
not node.name.endswith('_quant'):
return
# No assert on op_type as it is controlled by registry
# only try to quantize when given quantization parameters for it
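The guard added here recurs across the operator converters in this commit (attention.py, binary_op.py, concat.py, conv.py, direct_q8.py, and the EmbedLayerNormalization converter at the end): only nodes whose names were tagged with a '_quant' suffix during the quantize step are rewritten to their QLinear form; anything else is left untouched. A minimal sketch of the pattern, with illustrative names:

def should_convert(node_name, num_children):
    # Convert only nodes the quantizer actually processed and that have consumers.
    return num_children > 0 and node_name.endswith('_quant')

assert should_convert('relu_3_quant', 1) is True
assert should_convert('relu_3', 1) is False   # never quantized, so leave it alone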
31 changes: 31 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/argmax.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .base_operator import QuantOperatorBase

class QArgMax(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)

def convert(self):
node = self.node
origin_name = node.input[0].split('_argmax_node')[0]

if origin_name in self.quantizer.quantized_value_map:
node.input[0] = self.quantizer.quantized_value_map[origin_name].q_name
node.name = node.name + '_quant'
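The new QArgMax converter inserts no QLinear ops: because quantization with a positive scale is monotonic, ArgMax over the uint8 tensor yields the same indices as over the fp32 tensor, so the node only needs its input redirected to the quantized value recorded in quantized_value_map (the '_argmax_node' suffix is stripped to recover the original tensor name) and its name tagged with '_quant'. A toy illustration of that redirect, with hypothetical tensor names:

class _QuantizedValue:
    def __init__(self, q_name):
        self.q_name = q_name

quantized_value_map = {'logits': _QuantizedValue('logits_quantized')}

argmax_input = 'logits_argmax_node'
origin = argmax_input.split('_argmax_node')[0]            # -> 'logits'
if origin in quantized_value_map:
    argmax_input = quantized_value_map[origin].q_name     # -> 'logits_quantized'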
2 changes: 2 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/attention.py
@@ -38,6 +38,8 @@ def convert(self):
'''
node = self.node
assert (node.op_type == "Attention")
if not node.name.endswith('_quant'):
return

parents = self.quantizer.model.get_parents(node)
quantized_name = []
8 changes: 5 additions & 3 deletions neural_compressor/adaptor/ox_utils/operators/binary_op.py
@@ -29,11 +29,10 @@ def __init__(self, onnx_quantizer, onnx_node):

def convert(self):
node = self.node
if len(self.quantizer.model.get_children(node)) == 0:
if len(self.quantizer.model.get_children(node)) == 0 or \
not node.name.endswith('_quant'):
return
parents = self.quantizer.model.get_parents(node)
if all([i.op_type != 'DequantizeLinear' for i in parents]):
return
child = self.quantizer.model.get_children(node)[0]

qlinear_binary_math_output = child.output[0]
@@ -68,6 +67,9 @@ def quantize(self):
data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0])
if not data_found:
return

if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]):
return

self.quantizer.quantize_inputs(node, initializer_use_weight_qType=False)
if not self.disable_qdq_for_node_output or self.quantizer.mode != 'qdq':
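Besides the '_quant' guard in convert(), binary_op.py's quantize() now also bails out unless every input of the Add/Mul node passes is_valid_quantize_weight, leaving the node in fp32 otherwise. A rough sketch of what such a check typically verifies, written as an assumption rather than the quantizer's actual helper:

import onnx

def is_valid_quantize_weight(model, tensor_name):
    # An initializer is quantizable only if it is stored as float32;
    # non-initializer inputs are treated as activations here.
    for init in model.graph.initializer:
        if init.name == tensor_name:
            return init.data_type == onnx.TensorProto.FLOAT
    return True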
2 changes: 1 addition & 1 deletion neural_compressor/adaptor/ox_utils/operators/concat.py
@@ -52,7 +52,7 @@ def convert(self):

parents = self.quantizer.model.get_parents(node)
children = self.quantizer.model.get_children(node)
if len(children) == 0 or len(parents) == 0:
if len(children) == 0 or len(parents) == 0 or not node.name.endswith('_quant'):
return

if all([i.op_type == 'DequantizeLinear' for i in parents]) and \
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/conv.py
@@ -111,7 +111,8 @@ def convert(self):
node = self.node
assert (node.op_type in ["Conv", "FusedConv"])

if len(self.quantizer.model.get_children(node)) == 0:
if len(self.quantizer.model.get_children(node)) == 0 or \
not node.name.endswith('_quant'):
return
parents = self.quantizer.model.get_parents(node)
child = self.quantizer.model.get_children(node)[0]
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/direct_q8.py
@@ -32,7 +32,8 @@ def convert(self):
node = self.node
parents = self.quantizer.model.get_parents(node)
children = self.quantizer.model.get_children(node)
if len(children) == 0 and len(parents) == 0:
if (len(children) == 0 and len(parents) == 0) or \
not node.name.endswith('_quant'):
return

if any([i.op_type == 'DequantizeLinear' for i in parents]) and \
neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py
@@ -34,6 +34,8 @@ def __init__(self, onnx_quantizer, onnx_node):
def convert(self):
node = self.node
assert (node.op_type == "EmbedLayerNormalization")
if not node.name.endswith('_quant'):
return

'''
Pre-quantization EmbedLayerNorm inputs:
