Support ORT1.12 (#1203)
mengniwang95 committed Sep 5, 2022
1 parent ec84fd6 commit 498ac48
Showing 24 changed files with 653 additions and 27 deletions.
99 changes: 96 additions & 3 deletions neural_compressor/adaptor/onnxrt_qdq.yaml
@@ -87,7 +87,7 @@
int8: ['Conv', 'MatMul', 'Attention', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose']
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
@@ -98,7 +98,7 @@

-
version:
name: ['1.10.0', '1.11.0']
name: '1.10.0'

precisions:
<<: *common_precisions
@@ -107,7 +107,7 @@
int8: ['Conv', 'MatMul', 'Attention', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose']
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
@@ -174,6 +174,99 @@
graph_optimization:
<<: *default_optimization

-

version:
name: ['1.11.0', '1.12.0']

precisions:
<<: *common_precisions

ops:
int8: ['Conv', 'MatMul', 'Attention', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Squeeze', 'Reshape',
'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Gemm', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
int8: {
'Conv': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gather': {
'weight': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_channel', 'per_tensor'],
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor'],
}
},
'MatMul': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gemm': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'default': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
}
},
}


graph_optimization:
<<: *default_optimization

-

version:
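Reader note (not part of the diff): the new entry above pins a dedicated capability table to onnxruntime 1.11.0 and 1.12.0, whose int8 op list now also covers Gemm and Resize, while 1.10.0 keeps its own block. A minimal sketch of how such a version-gated table could be resolved at runtime follows; it assumes the yaml file is a top-level list of entries whose version.name is a string or a list of strings, and pick_capabilities is a hypothetical name, not the adaptor's actual API.

import yaml
import onnxruntime as ort

def pick_capabilities(yaml_path, ort_version=None):
    # Resolve the capability entry matching the installed onnxruntime version.
    ort_version = ort_version or ort.__version__
    with open(yaml_path) as f:
        entries = yaml.safe_load(f)
    for entry in entries:
        names = entry['version']['name']
        names = [names] if isinstance(names, str) else names
        if ort_version in names:
            return entry
    # No exact match: fall back to the entry gated on 'default'.
    return next(e for e in entries if e['version']['name'] == 'default')

With onnxruntime 1.12.0 installed, this would return the ['1.11.0', '1.12.0'] block added in this hunk.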
125 changes: 123 additions & 2 deletions neural_compressor/adaptor/onnxrt_qlinear.yaml
@@ -134,7 +134,7 @@
int8: ['Conv', 'MatMul', 'Attention', 'Mul', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Add', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose']
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
@@ -216,7 +216,7 @@
-

version:
name: ['1.9.0', '1.10.0', '1.11.0']
name: ['1.9.0', '1.10.0']

precisions:
<<: *common_precisions
@@ -314,6 +314,127 @@

graph_optimization:
<<: *default_optimization

-

version:
name: ['1.11.0', '1.12.0']

precisions:
<<: *common_precisions

ops:
int8: ['Conv', 'MatMul', 'Attention', 'Mul', 'Relu', 'Clip',
'LeakyRelu', 'Gather', 'Sigmoid', 'MaxPool', 'EmbedLayerNormalization',
'FusedConv', 'GlobalAveragePool', 'Pad', 'Split', 'Add', 'Squeeze',
'Reshape', 'Concat', 'AveragePool', 'Unsqueeze', 'Transpose', 'ArgMax',
'Gemm', 'Resize']
fp32: ['*'] # '*' means all op types

capabilities:
int8: {
'FusedConv': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Conv': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gather': {
'weight': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_channel', 'per_tensor'],
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor'],
}
},
'MatMul': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'Gemm': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'granularity': ['per_channel', 'per_tensor'],
'algorithm': ['minmax']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
}
},
'EmbedLayerNormalization': {
'weight': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
}
},
'default': {
'weight': {
'dtype': ['int8', 'fp32'],
'scheme': ['sym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
},
'activation': {
'dtype': ['uint8', 'fp32'],
'scheme': ['asym'],
'algorithm': ['minmax'],
'granularity': ['per_tensor']
}
},
}

graph_optimization:
<<: *default_optimization
-
version:
name: 'default'
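Reader note: the QLinearOps table for 1.11.0/1.12.0 likewise adds ArgMax, Gemm, and Resize to the int8 op list. Only Gemm gets a dedicated capability block here; ArgMax and Resize have none, so they would fall back to the 'default' entry (per-tensor, asymmetric uint8 activations). A toy sketch of that fallback, with hypothetical names and a trimmed-down capability dict:

def op_capability(int8_capabilities, op_type):
    # Ops without a dedicated block inherit the 'default' capability entry.
    return int8_capabilities.get(op_type, int8_capabilities['default'])

int8_caps = {
    'Gemm':    {'weight': {'dtype': ['int8', 'fp32']}, 'activation': {'dtype': ['uint8', 'fp32']}},
    'default': {'weight': {'dtype': ['int8', 'fp32']}, 'activation': {'dtype': ['uint8', 'fp32']}},
}
print(op_capability(int8_caps, 'Resize'))   # -> the 'default' entry
print(op_capability(int8_caps, 'Gemm'))     # -> the dedicated Gemm entry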
4 changes: 4 additions & 0 deletions neural_compressor/adaptor/ox_utils/onnxrt_mid.py
@@ -134,6 +134,10 @@ def augment_graph(self, activation_only=False, weight_only=False):
if node.op_type == 'EmbedLayerNormalization' and len(node.output) > 1 and \
node.output[1] in tensors_to_dump:
tensors_to_dump.remove(node.output[1])
elif node.op_type == 'Resize':
tensors_to_dump.remove(node.input[1])
if len(node.input) == 4:
tensors_to_dump.remove(node.input[2])
elif weight_only:
for input in node.input:
if self.already_quantized and \
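Reader note on the onnxrt_mid.py hunk: a Resize node's trailing inputs hold resize parameters rather than activations (ONNX layout X, roi, scales and optionally sizes), so the roi input, and the scales input for 4-input Resize, are removed from the list of tensors dumped during calibration. A small sketch of the same filtering, under the assumption that tensors_to_dump behaves like a set of tensor names:

def drop_resize_params(node, tensors_to_dump):
    # Keep only activation data for calibration; roi/scales carry no statistics.
    if node.op_type != 'Resize':
        return
    tensors_to_dump.discard(node.input[1])        # roi
    if len(node.input) == 4:
        tensors_to_dump.discard(node.input[2])    # scales (sizes supplied instead)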
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/activation.py
@@ -34,7 +34,8 @@ def convert(self):
if node.op_type in ['Relu', 'Clip']:
return

if len(self.quantizer.model.get_children(node)) == 0:
if len(self.quantizer.model.get_children(node)) == 0 or \
not node.name.endswith('_quant'):
return
# No assert on op_type as it is controlled by registry
# only try to quantize when given quantization parameters for it
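The guard added here recurs across the operator converters in this commit (attention.py, binary_op.py, concat.py, conv.py, direct_q8.py, and the EmbedLayerNormalization converter at the end): only nodes whose names were tagged with a '_quant' suffix during the quantize step are rewritten to their QLinear form; anything else is left untouched. A minimal sketch of the pattern, with illustrative names:

def should_convert(node_name, num_children):
    # Convert only nodes the quantizer actually processed and that have consumers.
    return num_children > 0 and node_name.endswith('_quant')

assert should_convert('relu_3_quant', 1) is True
assert should_convert('relu_3', 1) is False   # never quantized, so leave it alone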
31 changes: 31 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/argmax.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .base_operator import QuantOperatorBase

class QArgMax(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)

def convert(self):
node = self.node
origin_name = node.input[0].split('_argmax_node')[0]

if origin_name in self.quantizer.quantized_value_map:
node.input[0] = self.quantizer.quantized_value_map[origin_name].q_name
node.name = node.name + '_quant'
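The new QArgMax converter inserts no QLinear ops: because quantization with a positive scale is monotonic, ArgMax over the uint8 tensor yields the same indices as over the fp32 tensor, so the node only needs its input redirected to the quantized value recorded in quantized_value_map (the '_argmax_node' suffix is stripped to recover the original tensor name) and its name tagged with '_quant'. A toy illustration of that redirect, with hypothetical tensor names:

class _QuantizedValue:
    def __init__(self, q_name):
        self.q_name = q_name

quantized_value_map = {'logits': _QuantizedValue('logits_quantized')}

argmax_input = 'logits_argmax_node'
origin = argmax_input.split('_argmax_node')[0]            # -> 'logits'
if origin in quantized_value_map:
    argmax_input = quantized_value_map[origin].q_name     # -> 'logits_quantized'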
2 changes: 2 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/attention.py
@@ -38,6 +38,8 @@ def convert(self):
'''
node = self.node
assert (node.op_type == "Attention")
if not node.name.endswith('_quant'):
return

parents = self.quantizer.model.get_parents(node)
quantized_name = []
8 changes: 5 additions & 3 deletions neural_compressor/adaptor/ox_utils/operators/binary_op.py
@@ -29,11 +29,10 @@ def __init__(self, onnx_quantizer, onnx_node):

def convert(self):
node = self.node
if len(self.quantizer.model.get_children(node)) == 0:
if len(self.quantizer.model.get_children(node)) == 0 or \
not node.name.endswith('_quant'):
return
parents = self.quantizer.model.get_parents(node)
if all([i.op_type != 'DequantizeLinear' for i in parents]):
return
child = self.quantizer.model.get_children(node)[0]

qlinear_binary_math_output = child.output[0]
@@ -68,6 +67,9 @@ def quantize(self):
data_found, _, _, _, _ = self.quantizer._get_quantization_params(node.output[0])
if not data_found:
return

if not all([self.quantizer.is_valid_quantize_weight(i) for i in node.input]):
return

self.quantizer.quantize_inputs(node, initializer_use_weight_qType=False)
if not self.disable_qdq_for_node_output or self.quantizer.mode != 'qdq':
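Besides the '_quant' guard in convert(), binary_op.py's quantize() now also bails out unless every input of the Add/Mul node passes is_valid_quantize_weight, leaving the node in fp32 otherwise. A rough sketch of what such a check typically verifies, written as an assumption rather than the quantizer's actual helper:

import onnx

def is_valid_quantize_weight(model, tensor_name):
    # An initializer is quantizable only if it is stored as float32;
    # non-initializer inputs are treated as activations here.
    for init in model.graph.initializer:
        if init.name == tensor_name:
            return init.data_type == onnx.TensorProto.FLOAT
    return True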
2 changes: 1 addition & 1 deletion neural_compressor/adaptor/ox_utils/operators/concat.py
@@ -52,7 +52,7 @@ def convert(self):

parents = self.quantizer.model.get_parents(node)
children = self.quantizer.model.get_children(node)
if len(children) == 0 or len(parents) == 0:
if len(children) == 0 or len(parents) == 0 or not node.name.endswith('_quant'):
return

if all([i.op_type == 'DequantizeLinear' for i in parents]) and \
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/conv.py
@@ -111,7 +111,8 @@ def convert(self):
node = self.node
assert (node.op_type in ["Conv", "FusedConv"])

if len(self.quantizer.model.get_children(node)) == 0:
if len(self.quantizer.model.get_children(node)) == 0 or \
not node.name.endswith('_quant'):
return
parents = self.quantizer.model.get_parents(node)
child = self.quantizer.model.get_children(node)[0]
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/direct_q8.py
@@ -32,7 +32,8 @@ def convert(self):
node = self.node
parents = self.quantizer.model.get_parents(node)
children = self.quantizer.model.get_children(node)
if len(children) == 0 and len(parents) == 0:
if (len(children) == 0 and len(parents) == 0) or \
not node.name.endswith('_quant'):
return

if any([i.op_type == 'DequantizeLinear' for i in parents]) and \
neural_compressor/adaptor/ox_utils/operators/embed_layernorm.py
@@ -34,6 +34,8 @@ def __init__(self, onnx_quantizer, onnx_node):
def convert(self):
node = self.node
assert (node.op_type == "EmbedLayerNormalization")
if not node.name.endswith('_quant'):
return

'''
Pre-quantization EmbedLayerNorm inputs:
