Export Qlinear to QDQ (#224)
Signed-off-by: mengniwa <mengni.wang@intel.com>
mengniwang95 committed Dec 12, 2022
1 parent 40ab5a3 commit e996a93
Showing 24 changed files with 1,390 additions and 35 deletions.
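
Editor's note: every new QOperator class in this commit implements the same QLinear-to-QDQ rewrite — each fused QLinear contrib op is replaced by DequantizeLinear nodes on its quantized inputs, the plain float op, and a QuantizeLinear back to the original quantized output. As a reading aid, here is a minimal standalone sketch of that pattern (illustration only, not part of the diff; it mirrors the QActivationOperator.convert() added below):

import onnx

def qlinear_activation_to_qdq(node):
    # node: a QLinearLeakyRelu/QLinearSigmoid NodeProto whose inputs are
    # [X, X_scale, X_zero_point, Y_scale, Y_zero_point]
    in_dq = onnx.helper.make_node(
        'DequantizeLinear',
        node.input[:3],                     # X, X_scale, X_zero_point
        [node.name + '_in_dequant'],
        node.name + '_in_dequant')
    float_op = onnx.helper.make_node(
        node.op_type.split('QLinear')[-1],  # e.g. 'QLinearSigmoid' -> 'Sigmoid'
        [node.name + '_in_dequant'],
        [node.name + '_out'],
        node.name + '_convert')
    out_q = onnx.helper.make_node(
        'QuantizeLinear',
        [node.name + '_out', node.input[3], node.input[4]],
        list(node.output),
        node.name + '_out_quant')
    # splice these three nodes into the graph in place of node
    return [in_dq, float_op, out_q]
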
4 changes: 2 additions & 2 deletions neural_compressor/adaptor/ox_utils/operators/__init__.py
@@ -18,12 +18,12 @@

 from os.path import dirname, basename, isfile, join
 import glob
-from .ops import OPERATORS
+from .ops import OPERATORS, QOPERATORS

 modules = glob.glob(join(dirname(__file__), "*.py"))

 for f in modules:
     if isfile(f) and not f.startswith('__') and not f.endswith('__init__.py'):
         __import__(basename(f)[:-3], globals(), locals(), level=1)

-__all__ = ["OPERATORS"]
+__all__ = ["OPERATORS", "QOPERATORS"]
40 changes: 38 additions & 2 deletions neural_compressor/adaptor/ox_utils/operators/activation.py
@@ -17,7 +17,7 @@
 #

 import onnx
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry
 from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain

 @op_registry(op_types="LeakyRelu, Sigmoid")
@@ -87,4 +87,40 @@ def quantize(self):
             self.quantizer.dequantize_tensor(node, node.input[0])
         else:
             self.quantizer.model.replace_input_of_all_nodes(node.output[0], node.input[0])
-            self.quantizer.remove_nodes.append(node)
+            self.quantizer.remove_nodes.append(node)
+
+@qop_registry(op_types="QLinearLeakyRelu, QLinearSigmoid")
+class QActivationOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
+
+    def convert(self):
+        node = self.node
+        add_nodes = []
+        inits = []
+        # input dq
+        in_dq = onnx.helper.make_node(
+            'DequantizeLinear',
+            node.input[:3],
+            [node.name + '_in_dequant'],
+            node.name + '_in_dequant')
+        inputs = [node.name + '_in_dequant']
+        add_nodes.append(in_dq)
+        # output q
+        out_q = onnx.helper.make_node(
+            'QuantizeLinear',
+            [node.name + '_out', node.input[3], node.input[4]],
+            node.output,
+            node.name + '_out_quant')
+        outputs = [node.name + '_out']
+        add_nodes.append(out_q)
+
+        kwargs = {}
+        for attribute in node.attribute: # pragma: no cover
+            kwargs.update(attribute_to_kwarg(attribute))
+
+        activation_node = onnx.helper.make_node(
+            node.op_type.split('QLinear')[-1], inputs,
+            outputs, node.name + '_convert', **kwargs)
+        add_nodes.append(activation_node)
+        return True, add_nodes, inits
12 changes: 7 additions & 5 deletions neural_compressor/adaptor/ox_utils/operators/argmax.py
@@ -15,9 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry

 @op_registry(op_types="ArgMax")
 class ArgMaxOperator(Operator):
@@ -35,5 +33,9 @@ def convert(self, convert_format):
         origin_name = node.input[0].split('_argmax_node')[0]

         if origin_name in self.quantizer.quantized_value_map:
-            node.input[0] = self.quantizer.quantized_value_map[origin_name].q_name
-            node.name = node.name + '_quant'
+            node.name = node.name + '_quant'
+
+@qop_registry(op_types="ArgMax")
+class QArgMaxOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
47 changes: 45 additions & 2 deletions neural_compressor/adaptor/ox_utils/operators/attention.py
@@ -17,8 +17,8 @@
 #

 import onnx
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
-from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, qop_registry, QOperator
+from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain, find_by_name

 @op_registry(op_types="Attention")
 class AttentionOperator(Operator):
@@ -74,3 +74,46 @@ def convert(self, convert_format):
         self.quantizer.new_nodes.append(qattention_node)

         self.quantizer.remove_nodes.append(node)
+
+@qop_registry(op_types="QAttention")
+class QAttentionOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
+
+    def convert(self):
+        node = self.node
+        add_nodes = []
+        inputs = []
+        inits = []
+        if find_by_name(node.input[3], self.initializers) is None:
+            return False, add_nodes, inits
+        # input dq
+        in_dq1 = onnx.helper.make_node(
+            'DequantizeLinear',
+            [node.input[0], node.input[3], node.input[6]],
+            [node.name + '_in_dequant1'],
+            node.name + '_in_dequant1')
+
+        in_dq2 = onnx.helper.make_node(
+            'DequantizeLinear',
+            [node.input[1], node.input[4], node.input[7]],
+            [node.name + '_in_dequant2'],
+            node.name + '_in_dequant2')
+        inputs = [node.name + '_in_dequant1',
+                  node.name + '_in_dequant2',
+                  node.input[2],
+                  node.input[5]]
+
+        add_nodes.extend([in_dq1, in_dq2])
+
+        outputs = node.output
+        kwargs = {}
+        for attribute in node.attribute: # pragma: no cover
+            kwargs.update(attribute_to_kwarg(attribute))
+        kwargs["domain"] = ms_domain
+
+        binary_node = onnx.helper.make_node(
+            'Attention', inputs,
+            outputs, node.name + '_convert', **kwargs)
+        add_nodes.append(binary_node)
+        return True, add_nodes, inits
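
Editor's note: the hard-coded indices above follow the input layout of the com.microsoft QAttention contrib op. The mapping below is a sketch of that layout as consumed by the converter (not part of the diff): the two DequantizeLinear nodes recover float input and weight, bias and mask_index pass through unchanged, and the early return False gives up when input_scale is not a graph initializer.

# Indices into node.input for com.microsoft QAttention, as consumed above:
QATTENTION_INPUTS = {
    0: 'input',              # quantized; dequantized with inputs 3 and 6
    1: 'weight',             # quantized; dequantized with inputs 4 and 7
    2: 'bias',               # float; passed straight to Attention
    3: 'input_scale',        # must be an initializer or convert() bails out
    4: 'weight_scale',
    5: 'mask_index',         # passed straight to Attention
    6: 'input_zero_point',
    7: 'weight_zero_point',
}
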
47 changes: 45 additions & 2 deletions neural_compressor/adaptor/ox_utils/operators/binary_op.py
@@ -17,7 +17,7 @@
 #

 import onnx
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry
 from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain

 @op_registry(op_types="Add, Mul")
@@ -77,4 +77,47 @@ def convert(self, convert_format):
         self.quantizer.new_nodes += [qlinear_binary_math_node]
         self.quantizer.remove_nodes.extend(parents)
         self.quantizer.remove_nodes.append(child)
-        self.quantizer.remove_nodes.append(node)
+        self.quantizer.remove_nodes.append(node)
+
+@qop_registry(op_types="QLinearAdd, QLinearMul")
+class QBinaryOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
+
+    def convert(self):
+        node = self.node
+        add_nodes = []
+        inits = []
+        # input dq
+        in_dq1 = onnx.helper.make_node(
+            'DequantizeLinear',
+            node.input[:3],
+            [node.name + '_in_dequant1'],
+            node.name + '_in_dequant1')
+
+        in_dq2 = onnx.helper.make_node(
+            'DequantizeLinear',
+            node.input[3:6],
+            [node.name + '_in_dequant2'],
+            node.name + '_in_dequant2')
+        inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2']
+
+        add_nodes.extend([in_dq1, in_dq2])
+        # output q
+        out_q = onnx.helper.make_node(
+            'QuantizeLinear',
+            [node.name + '_out', node.input[6], node.input[7]],
+            node.output,
+            node.name + '_out_quant')
+        outputs = [node.name + '_out']
+        add_nodes.append(out_q)
+
+        kwargs = {}
+        for attribute in node.attribute: # pragma: no cover
+            kwargs.update(attribute_to_kwarg(attribute))
+
+        binary_node = onnx.helper.make_node(
+            node.op_type.split('QLinear')[-1], inputs,
+            outputs, node.name + '_convert', **kwargs)
+        add_nodes.append(binary_node)
+        return True, add_nodes, inits
41 changes: 40 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/concat.py
@@ -17,7 +17,7 @@
 #

 import onnx
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry
 from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain

 @op_registry(op_types="Concat")
@@ -96,3 +96,42 @@ def cast(self): # pragma: no cover
         if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]:
             return
         self.quantizer.dtype_cast(self.node, self.dtype)
+
+@qop_registry(op_types="QLinearConcat")
+class QConcatOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
+
+    def convert(self):
+        node = self.node
+        add_nodes = []
+        inputs = []
+        inits = []
+        # input dq: inputs are [Y_scale, Y_zero_point, X0, X0_scale,
+        # X0_zero_point, X1, ...], i.e. one triple per data tensor
+        for i in range(int((len(node.input) - 2) / 3)):
+            in_dq = onnx.helper.make_node(
+                'DequantizeLinear',
+                node.input[2 + i*3 : 2 + (i+1)*3],
+                [node.name + '_in_dequant_' + str(i)],
+                node.name + '_in_dequant_' + str(i))
+            inputs.append(node.name + '_in_dequant_' + str(i))
+            add_nodes.append(in_dq)
+
+        # output q
+        out_q = onnx.helper.make_node(
+            'QuantizeLinear',
+            [node.name + '_out', node.input[0], node.input[1]],
+            node.output,
+            node.name + '_out_quant')
+        outputs = [node.name + '_out']
+        add_nodes.append(out_q)
+
+        kwargs = {}
+        for attribute in node.attribute: # pragma: no cover
+            kwargs.update(attribute_to_kwarg(attribute))
+
+        concat_node = onnx.helper.make_node(
+            'Concat', inputs,
+            outputs, node.name + '_convert', **kwargs)
+        add_nodes.append(concat_node)
+        return True, add_nodes, inits
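
Editor's note: the slicing above relies on the QLinearConcat input layout from the onnxruntime contrib-op spec — Y_scale and Y_zero_point first, then one (X_i, X_i_scale, X_i_zero_point) triple per concatenated tensor — so the loop runs (len(node.input) - 2) / 3 times, once per data tensor. A quick sanity check of that arithmetic with hypothetical tensor names:

# Hypothetical QLinearConcat input list carrying two data tensors:
inputs = ['Y_scale', 'Y_zp',
          'X0', 'X0_scale', 'X0_zp',
          'X1', 'X1_scale', 'X1_zp']
n = (len(inputs) - 2) // 3                          # 2 data tensors
triples = [inputs[2 + 3*i : 2 + 3*(i+1)] for i in range(n)]
print(triples)  # [['X0', 'X0_scale', 'X0_zp'], ['X1', 'X1_scale', 'X1_zp']]
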
70 changes: 69 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/conv.py
@@ -19,7 +19,7 @@

 import onnx
 from onnx import onnx_pb as onnx_proto
-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry
 from neural_compressor.adaptor.ox_utils.util import find_by_name, attribute_to_kwarg

 @op_registry(op_types="Conv, FusedConv")
@@ -156,6 +156,7 @@ def convert(self, convert_format):
             if attribute.name == 'activation_params': # pragma: no cover
                 continue
             kwargs.update(attribute_to_kwarg(attribute))
+
         qlinear_conv_node = onnx.helper.make_node("QLinearConv", qlinear_conv_inputs,
                                                   [qlinear_conv_output],
                                                   node.name, **kwargs)
@@ -164,4 +165,71 @@ def convert(self, convert_format):
         self.quantizer.remove_nodes.append(child)
         self.quantizer.remove_nodes.append(node)

+@qop_registry(op_types="QLinearConv")
+class QConvOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
+
+    def convert(self):
+        node = self.node
+        add_nodes = []
+        inits = []
+        # input dq
+        in_dq1 = onnx.helper.make_node(
+            'DequantizeLinear',
+            node.input[:3],
+            [node.name + '_in_dequant1'],
+            node.name + '_in_dequant1')
+
+        in_dq2 = onnx.helper.make_node(
+            'DequantizeLinear',
+            node.input[3:6],
+            [node.name + '_in_dequant2'],
+            node.name + '_in_dequant2')
+
+        add_nodes.extend([in_dq1, in_dq2])
+        inputs = [node.name + '_in_dequant1', node.name + '_in_dequant2']
+        if len(node.input) == 9:
+            import numpy as np
+            input_scale = onnx.numpy_helper.to_array(
+                find_by_name(node.input[1], self.initializers))
+            weight_scale = onnx.numpy_helper.to_array(
+                find_by_name(node.input[4], self.initializers))
+            bias_scale = input_scale * weight_scale
+
+            # update scale initializer
+            bias_scale_data = np.asarray(bias_scale, dtype=np.float32).reshape(-1)
+            bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data,
+                                                                  node.input[8] + '_scale')
+            inits.extend([bias_scale_initializer])
+
+            # update zero initializer
+            bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
+            bias_zp_initializer = onnx.numpy_helper.from_array(
+                bias_zp_data, node.input[8] + '_zero_point')
+            inits.extend([bias_zp_initializer])
+            in_dq3 = onnx.helper.make_node(
+                'DequantizeLinear',
+                [node.input[8], bias_scale_initializer.name, bias_zp_initializer.name],
+                [node.name + '_in_dequant3'],
+                node.name + '_in_dequant3')
+            inputs.append(in_dq3.output[0])
+            add_nodes.append(in_dq3)
+        # output q
+        out_q = onnx.helper.make_node(
+            'QuantizeLinear',
+            [node.name + '_out', node.input[6], node.input[7]],
+            node.output,
+            node.name + '_out_quant')
+        outputs = [node.name + '_out']
+        add_nodes.append(out_q)
+
+        kwargs = {}
+        for attribute in node.attribute: # pragma: no cover
+            kwargs.update(attribute_to_kwarg(attribute))
+
+        binary_node = onnx.helper.make_node(
+            node.op_type.split('QLinear')[-1], inputs,
+            outputs, node.name + '_convert', **kwargs)
+        add_nodes.append(binary_node)
+        return True, add_nodes, inits
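
Editor's note: the bias branch above matches the ONNX QLinearConv convention — the optional ninth input is an int32 bias quantized with scale x_scale * w_scale and zero point 0 — which is exactly the pair of initializers the converter synthesizes for the third DequantizeLinear. A small numeric illustration with made-up values:

import numpy as np

input_scale = np.float32(0.02)                           # hypothetical x_scale
weight_scale = np.array([0.5, 0.25], dtype=np.float32)   # hypothetical per-channel w_scale
bias_scale = input_scale * weight_scale                  # [0.01, 0.005]

bias_int32 = np.array([300, -1200], dtype=np.int32)
bias_float = bias_scale * bias_int32                     # [3.0, -6.0]
print(bias_float)  # what the extra DequantizeLinear recovers at runtime
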
7 changes: 6 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/direct_q8.py
@@ -16,7 +16,7 @@
 # limitations under the License.
 #

-from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator
+from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, qop_registry, QOperator

 @op_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze")
 class Direct8BitOperator(Operator):
@@ -83,3 +83,8 @@ def cast(self):
         if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]:
             return
         self.quantizer.dtype_cast(self.node, self.dtype)
+
+@qop_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze")
+class QDirectOperator(QOperator):
+    def __init__(self, onnx_node, children, initializers):
+        super().__init__(onnx_node, children, initializers)
