Fix quantization to compute in fp32 instead of fp64 (#1153)

intel · Aug 17, 2022 · cb7b485 · cb7b485
1 parent 15a05fd
commit cb7b485
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py
@@ -155,11 +155,12 @@ def quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point):
             - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where
                 m = max(abs(rmin), abs(rmax))
     '''
+    data = np.asarray(data)
     if qType == onnx_proto.TensorProto.INT8 and scheme == 'sym':
         # signed byte type
-        quantized_data = (np.asarray(data) / scale).round().astype('b')
+        quantized_data = (data.astype(np.float32) / scale).round().astype('b')
     elif qType == onnx_proto.TensorProto.UINT8 and scheme == 'asym':
-        quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B')
+        quantized_data = ((data.astype(np.float32) / scale).round() + zero_point).astype('B')
     else:
         raise ValueError("Unexpected combination of data type {} and scheme {}.".format(
                                                                         qType, scheme))