Add fp16 ops for onnxrt adaptor (#915)
Signed-off-by: Mengni Wang <mengni.wang@intel.com>
mengniwang95 committed May 31, 2023
1 parent 9ecc9f5 commit 15d5518
Showing 13 changed files with 356 additions and 11 deletions.
7 changes: 6 additions & 1 deletion neural_compressor/adaptor/onnxrt_cuda.yaml
@@ -99,7 +99,12 @@
     }
     fp16: &common_fp16 ['Concat', 'Gather', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze',
            'EmbedLayerNormalization', 'Attention', 'Split', 'Sigmoid', 'Relu', 'Mul', 'Pad', 'MaxPool',
-           'MatMul', 'LeakyRelu', 'GlobalAveragePool', 'Gemm', 'Conv', 'AveragePool', 'Add', 'Clip']
+           'MatMul', 'LeakyRelu', 'GlobalAveragePool', 'Gemm', 'Conv', 'AveragePool', 'Add', 'Clip',
+           'BatchNormalization', 'Softmax', 'Sum', 'Abs', 'BiasGelu', 'Exp', 'FastGelu',
+           'Gelu', 'Log', 'Round', 'Sigmoid', 'Sqrt', 'Tanh', 'Sub', 'Mul', 'Div', 'Pow',
+           'ReduceMean', 'Equal', 'FusedMatMul', 'Greater', 'GreaterOrEqual', 'Less', 'LessOrEqual',
+           'ReduceL1', 'ReduceL2', 'ReduceLogSum', 'ReduceLogSumExp', 'ReduceMax', 'ReduceProd',
+           'ReduceSum', 'ReduceSumSquare', 'LayerNormalization', 'Concat']
     bf16: &common_bf16 ['Concat', 'Gather', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze',
            'Split', 'Sigmoid', 'Relu', 'Mul', 'MatMul', 'Gemm', 'Add']
     recipes: &default_optimization
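For reference, the fp16/bf16 entries are plain YAML anchored lists, so the expanded op set can be inspected with any YAML loader. A minimal sketch against a toy fragment mirroring the lists above (assumes PyYAML; this is not the full adaptor config):

```python
import yaml

# Toy fragment mirroring the anchored lists above, not the real config file.
snippet = """
fp16: &common_fp16 ['Concat', 'Gather', 'ReduceMean', 'LayerNormalization']
bf16: &common_bf16 ['Concat', 'Gather', 'Reshape']
"""
ops = yaml.safe_load(snippet)
print('ReduceMean' in ops['fp16'])  # True: one of the newly added fp16 ops
print('ReduceMean' in ops['bf16'])  # False: the bf16 list is unchanged
```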
10 changes: 9 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/activation.py
@@ -138,4 +138,12 @@ def convert(self):
                 node.op_type.split('QLinear')[-1], inputs,
                 outputs, node.name + '_convert', **kwargs)
             add_nodes.append(activation_node)
-        return True, add_nodes, inits
+        return True, add_nodes, inits
+
+@op_registry(op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh")
+class Float16ActivationOperator(Operator):
+    """Float16 Activation operator."""
+
+    def __init__(self, onnx_quantizer, onnx_node):
+        """Initialization."""
+        super(Float16ActivationOperator, self).__init__(onnx_quantizer, onnx_node)
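The new class piggybacks on op_registry, which maps comma-separated op type names to handler classes; the base Operator then supplies the default cast() behavior. A stripped-down sketch of that dispatch pattern (illustrative only; the real registry in ox_utils/operators/ops.py tracks more state):

```python
# Stripped-down model of the op_registry dispatch; names mirror the real
# decorator, but this is not the neural-compressor implementation.
OPERATORS = {}

def op_registry(op_types):
    """Register a handler class for each comma-separated ONNX op type."""
    def decorator(cls):
        for op_type in [t.strip() for t in op_types.split(',')]:
            OPERATORS[op_type] = cls
        return cls
    return decorator

@op_registry(op_types="Softmax, BiasGelu, Elu, Exp, FastGelu, Gelu, Softplus, Tanh")
class Float16ActivationOperator:
    def __init__(self, onnx_quantizer, onnx_node):
        self.quantizer, self.node = onnx_quantizer, onnx_node

print(OPERATORS['Gelu'].__name__)  # Float16ActivationOperator
```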
8 changes: 8 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/binary_op.py
@@ -134,3 +134,11 @@ def convert(self):
                 outputs, node.name + '_convert', **kwargs)
             add_nodes.append(binary_node)
         return True, add_nodes, inits
+
+@op_registry(op_types="Sum, Sub, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual")
+class Float16BinaryOperator(Operator):
+    """Float16 Binary operator."""
+
+    def __init__(self, onnx_quantizer, onnx_node):
+        """Initialization."""
+        super(Float16BinaryOperator, self).__init__(onnx_quantizer, onnx_node)
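One subtlety with the comparison ops registered here (Equal, Greater, Less, ...): their outputs are BOOL, so only their float inputs get cast; cast_outputs leaves any output whose recorded elem_type is not FLOAT untouched (see the guard in the quantizer.py hunk below). A quick illustration of that predicate, assuming onnx is installed:

```python
from onnx import helper, TensorProto

# Value info as shape inference would record it for a Greater output.
out_vi = helper.make_tensor_value_info('greater_out', TensorProto.BOOL, [2, 2])

# The same predicate cast_outputs uses to decide whether a Cast is needed.
needs_cast = (out_vi.type.HasField('tensor_type')
              and out_vi.type.tensor_type.elem_type == TensorProto.FLOAT)
print(needs_cast)  # False -> no Cast appended after a Greater node
```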
6 changes: 4 additions & 2 deletions neural_compressor/adaptor/ox_utils/operators/concat.py
@@ -101,9 +101,11 @@ def convert(self, convert_format):
     def cast(self): # pragma: no cover
         """Cast node."""
         node = self.node
-        if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]:
+        cast_tensor = [i.tensor_name for i in self.quantizer.new_value_info.values()]
+        if not all([i in cast_tensor for i in node.input]):
             return
-        self.quantizer.dtype_cast(self.node, self.dtype)
+        self.quantizer.cast_inputs(self.node, self.dtype)
+        self.quantizer.cast_outputs(self.node, self.dtype)
 
 @qop_registry(op_types="QLinearConcat")
 class QConcatOperator(QOperator):
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/direct_q8.py
@@ -79,7 +79,8 @@ def cast(self): # pragma: no cover
         node = self.node
         if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]:
             return
-        self.quantizer.dtype_cast(self.node, self.dtype)
+        self.quantizer.cast_inputs(self.node, self.dtype, [0])
+        self.quantizer.cast_outputs(self.node, self.dtype)
 
 @qop_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze")
 class QDirectOperator(QOperator):
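The indices=[0] argument matters for this group: in current opsets Reshape (and Squeeze/Unsqueeze from opset 13 on) carries a second int64 input (shape or axes) that must never be routed through a Cast. A small sketch of the filter, assuming onnx is installed:

```python
from onnx import helper

# Reshape: input 0 is float data, input 1 is an int64 shape tensor.
reshape = helper.make_node('Reshape', inputs=['x', 'shape'], outputs=['y'])

# With indices=[0], only the data input is a cast candidate.
indices = [0]
cast_candidates = [name for i, name in enumerate(reshape.input) if i in indices]
print(cast_candidates)  # ['x'] -- 'shape' stays int64
```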
8 changes: 8 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/matmul.py
@@ -195,3 +195,11 @@ def convert(self):
                 outputs, node.name + '_convert', **kwargs)
             add_nodes.append(matmul_node)
         return True, add_nodes, inits
+
+@op_registry(op_types="FusedMatMul")
+class FusedMatMulOperator(Operator):
+    """FusedMatMul Operator."""
+
+    def __init__(self, onnx_quantizer, onnx_node):
+        """Initialization."""
+        super(FusedMatMulOperator, self).__init__(onnx_quantizer, onnx_node)
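FusedMatMul is not a standard ONNX op but an ONNX Runtime contrib op in the com.microsoft domain (a MatMul with fused transpose/scale attributes), which is why it appears only in this adaptor's fp16 list. A sketch of constructing one, assuming the contrib-op attributes documented by onnxruntime:

```python
from onnx import helper

# com.microsoft FusedMatMul: plain MatMul plus fused alpha/transA/transB.
node = helper.make_node(
    'FusedMatMul', inputs=['A', 'B'], outputs=['Y'],
    domain='com.microsoft', alpha=1.0, transA=0, transB=1)
print(node.domain)  # com.microsoft
```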
45 changes: 45 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/norm.py
@@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalization Operator."""

import onnx
from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator, QOperator, qop_registry
from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg, ms_domain

@op_registry(op_types="BatchNormalization")
class BatchNormalizationOperator(Operator):
"""BatchNormalization Operator."""

def __init__(self, onnx_quantizer, onnx_node):
"""Initialization."""
super(BatchNormalizationOperator, self).__init__(onnx_quantizer, onnx_node)

def cast(self):
"""Cast node."""
if self.dtype == 'bf16':
self.quantizer.cast_inputs(self.node, self.dtype, [0])
else:
self.quantizer.cast_inputs(self.node, self.dtype)
self.quantizer.cast_outputs(self.node, self.dtype)

@op_registry(op_types="LayerNormalization")
class NormalizationOperator(Operator):
"""Normalization Operator."""

def __init__(self, onnx_quantizer, onnx_node):
"""Initialization."""
super(NormalizationOperator, self).__init__(onnx_quantizer, onnx_node)
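Note the asymmetry in BatchNormalizationOperator.cast(): under bf16 only the data input (index 0) is cast, presumably because the kernels expect fp32 scale/bias/mean/var, while fp16 casts all five inputs. A sketch mirroring that index filter on a standard 5-input BatchNormalization node, assuming onnx is installed:

```python
from onnx import helper

bn = helper.make_node(
    'BatchNormalization',
    inputs=['x', 'scale', 'bias', 'mean', 'var'], outputs=['y'])

def selected_inputs(node, indices=None):
    # Mirrors cast_inputs' filter: indices=None means every input.
    return [n for i, n in enumerate(node.input)
            if indices is None or i in indices]

print(selected_inputs(bn, [0]))  # bf16 branch: ['x']
print(selected_inputs(bn))       # fp16 branch: all five inputs
```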
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/ops.py
@@ -107,7 +107,8 @@ def convert(self, convert_format):
 
     def cast(self): # pragma: no cover
         """Cast node."""
-        self.quantizer.dtype_cast(self.node, self.dtype)
+        self.quantizer.cast_inputs(self.node, self.dtype)
+        self.quantizer.cast_outputs(self.node, self.dtype)
 
 class QOperator(object):
     """Base QOperator."""
28 changes: 28 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/reduce.py
@@ -0,0 +1,28 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Reduce Operator."""

from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator

@op_registry(op_types="ReduceMean, ReduceLogSum, ReduceLogSumExp, ReduceMax, " \
"ReduceL1, ReduceL2, ReduceProd, ReduceSum, ReduceSumSquare")
class ReduceOperator(Operator):
"""Reduce Operator."""

def __init__(self, onnx_quantizer, onnx_node):
"""Initialization."""
super(ReduceOperator, self).__init__(onnx_quantizer, onnx_node)
3 changes: 2 additions & 1 deletion neural_compressor/adaptor/ox_utils/operators/split.py
@@ -87,7 +87,8 @@ def cast(self): # pragma: no cover
         node = self.node
         if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]:
             return
-        self.quantizer.dtype_cast(self.node, self.dtype)
+        self.quantizer.cast_inputs(self.node, self.dtype)
+        self.quantizer.cast_outputs(self.node, self.dtype)
 
 @qop_registry(op_types="Split")
 class QSplitOperator(QOperator):
27 changes: 27 additions & 0 deletions neural_compressor/adaptor/ox_utils/operators/unary_op.py
@@ -0,0 +1,27 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unary operator."""

from neural_compressor.adaptor.ox_utils.operators.ops import op_registry, Operator

@op_registry(op_types="Abs, Exp, Log, Round, Sqrt")
class UnaryOperator(Operator):
"""Unary operator."""

def __init__(self, onnx_quantizer, onnx_node):
"""Initialization."""
super(UnaryOperator, self).__init__(onnx_quantizer, onnx_node)
12 changes: 8 additions & 4 deletions neural_compressor/adaptor/ox_utils/quantizer.py
@@ -372,9 +372,11 @@ def dfs(match_nodes, node, pattern):
                     self.model.replace_node_input(node, old_input_name, new_input_name)
         self.model.update()
 
-    def dtype_cast(self, node, cfg, keep_io_types=True): # pragma: no cover
-        """Cast node dtype."""
+    def cast_inputs(self, node, cfg, indices=None):
+        """Cast node input dtype."""
         for idx, tensor_name in enumerate(node.input):
+            if indices and idx not in indices:
+                continue
             initializer = find_by_name(tensor_name, self.model.initializer())
             if initializer is not None:
                 if initializer.data_type != onnx_proto.TensorProto.FLOAT:
@@ -394,10 +396,12 @@ def dtype_cast(self, node, cfg, keep_io_types=True): # pragma: no cover
                 node.input[idx] = name
                 self.new_value_info[name] = ValueInfo(tensor_name,
                     TensorProto.FLOAT, dtype_mapping[cfg])
-        if all([i not in self.new_value_info for i in node.input]):
-            return
+
+    def cast_outputs(self, node, cfg, indices=None):
+        """Cast node output dtype."""
         for idx, tensor_name in enumerate(node.output):
+            if indices and idx not in indices:
+                continue
             if tensor_name in self.value_infos and \
                     self.value_infos[tensor_name].type.HasField('tensor_type') and \
                     self.value_infos[tensor_name].type.tensor_type.elem_type != TensorProto.FLOAT:
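Mechanically, both helpers rewire the graph through explicit Cast nodes rather than rewriting tensors in place. A simplified sketch of the input-side step for float inputs, assuming plain onnx (the real cast_inputs also converts initializers directly and records ValueInfo entries so original dtypes can be restored):

```python
from onnx import helper, TensorProto

def cast_input(node, idx, graph_nodes, to=TensorProto.FLOAT16):
    """Insert a Cast feeding node.input[idx] and rewire the edge."""
    src = node.input[idx]
    casted = src + '_fp16'
    graph_nodes.insert(graph_nodes.index(node),
                       helper.make_node('Cast', [src], [casted], to=to))
    node.input[idx] = casted

matmul = helper.make_node('MatMul', ['A', 'B'], ['Y'])
nodes = [matmul]
cast_input(matmul, 0, nodes)
cast_input(matmul, 1, nodes)
print([n.op_type for n in nodes])  # ['Cast', 'Cast', 'MatMul']
print(list(matmul.input))          # ['A_fp16', 'B_fp16']
```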
