larq · lgeiger · May 5, 2021 · Apr 15, 2021 · Apr 16, 2021 · Apr 16, 2021
diff --git a/larq/quantizers.py b/larq/quantizers.py
@@ -558,47 +558,133 @@ class DoReFa(_BaseQuantizer):
     0 & \text{else}
     \end{cases}\\]
 
+    In `"weights"` mode, the quantization behaves differently from the
+    quantization of activations:
+    instead of limiting the input operands (in this case, the weights) with a
+    hard limiter, a hyperbolic tangent is applied to achieve a softer limiting
+    whose gradient is itself continuously differentiable.
+
+    \\[
+    w_{lim}(w) = \tanh(w)
+    \\]
+
+    Furthermore, the weights of each layer are normalized such that the weight
+    with the largest magnitude gets the largest or smallest (depending on its
+    sign) quantizable value. That way, the full quantizable numeric range is utilized.
+
+    \\[
+    w_{norm}(w) = \frac{w}{\max(|w|)}
+    \\]
+
+    The formulas can be found in section 2.3 of the paper. Note that the paper
+    quantizes weights on the numeric range [-1, 1], while activations are
+    quantized on the numeric range [0, 1]. This implementation uses the same
+    ranges as specified in the paper.
+
+    The activation quantizer implements the function `quantize_k` from the
+    paper on the numeric range [0, 1]. The weight mode adds pre- and
+    post-processing for numeric range adaptation, soft limiting and norming.
+    The full quantization function, including the range adaptation, is
+
+    \\[
+    q(w) = 2 \, quantize_{k}(\frac{w_{norm}\left(w_{lim}\left(w\right)\right)}{2} + \frac{1}{2}) - 1
+    \\]
+
     !!! warning
-        While the DoReFa paper describes how to do quantization for both weights and
-        activations, this implementation is only valid for activations, and this
-        quantizer should therefore not be used as a kernel quantizer.
+        The weight mode works for weights on the range [-1, 1], which matches the
+        default setting of `constraints.weight_clip`. Do not use this quantizer
+        with a different constraint `clip_value` than the default one.
 
     ```plot-activation
     quantizers.DoReFa
     ```
 
     # Arguments
         k_bit: number of bits for the quantization.
+        mode: `"activations"` for clipping inputs on [0, 1] range or `"weights"` for
+            soft-clipping and norming weights on [-1, 1] range before applying
+            quantization.
         metrics: An array of metrics to add to the layer. If `None` the metrics set in
             `larq.context.metrics_scope` are used. Currently only the `flip_ratio`
             metric is available.
 
     # Returns
         Quantization function
 
+    # Raises
+        ValueError: If `mode` is neither `"activations"` nor `"weights"`.
+
     # References
         - [DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low
             Bitwidth Gradients](https://arxiv.org/abs/1606.06160)
     """
     precision = None
 
-    def __init__(self, k_bit: int = 2, **kwargs):
+    def __init__(self, k_bit: int = 2, mode: str = "activations", **kwargs):
         self.precision = k_bit
+
+        if mode not in ("activations", "weights"):
+            raise ValueError(
+                f"Invalid DoReFa quantizer mode {mode}. "
+                "Valid values are 'activations' and 'weights'."
+            )
+        self.mode = mode
+
         super().__init__(**kwargs)
 
+    def weight_preprocess(self, inputs):
+        # Limit inputs to [-1, 1] range
+        limited = tf.math.tanh(inputs)
+
+        # Divisor for the max-value norm (largest magnitude after tanh).
+        dividend = tf.math.reduce_max(tf.math.abs(limited))
+
+        # Need to stop the gradient here. Otherwise, for the element with the
+        # largest magnitude (the one that determines `dividend`), the norm
+        # computes limited/limited. Simplified by hand, y = x/x = 1, so
+        # dy/dx = 0. But TF does NOT do this simplification when computing
+        # the gradient of the normed = limited/dividend operation. As a
+        # result, the gradient becomes complicated, because `dividend` is not
+        # treated as a constant but as a function of `limited`. Here,
+        # tf.stop_gradient marks `dividend` (which, despite its name, is the
+        # divisor of the division below) explicitly as a constant.
+        dividend = tf.stop_gradient(dividend)
+
+        # Normalize and then scale from value range [-1, 1] to [0, 1] (the
+        # range expected by the core quantization operation).
+        # If the dividend used for the normalization is 0, all elements of
+        # the weight tensor are 0 and divide_no_nan returns 0 for all of them,
+        # so nothing is normalized and every weight maps to the midpoint 0.5.
+        return tf.math.divide_no_nan(limited, 2.0 * dividend) + 0.5
+
     def call(self, inputs):
-        inputs = tf.clip_by_value(inputs, 0.0, 1.0)
+        # Depending on quantizer mode (activation or weight) just clip inputs
+        # on [0, 1] range or use weight preprocessing method.
+        if self.mode == "activations":
+            inputs = tf.clip_by_value(inputs, 0.0, 1.0)
+        elif self.mode == "weights":
+            inputs = self.weight_preprocess(inputs)
+        else:
+            raise ValueError(
+                f"Invalid DoReFa quantizer mode {self.mode}. "
+                "Valid values are 'activations' and 'weights'."
+            )
 
         @tf.custom_gradient
         def _k_bit_with_identity_grad(x):
             n = 2 ** self.precision - 1
             return tf.round(x * n) / n, lambda dy: dy
 
         outputs = _k_bit_with_identity_grad(inputs)
+
+        # Scale weights from [0, 1] quantization range back to [-1,1] range
+        if self.mode == "weights":
+            outputs = 2.0 * outputs - 1.0
+
         return super().call(outputs)
 
     def get_config(self):
-        return {**super().get_config(), "k_bit": self.precision}
+        return {**super().get_config(), "k_bit": self.precision, "mode": self.mode}
 
 
 # `DoReFa` used to be called `DoReFaQuantizer`; this alias is for

diff --git a/larq/quantizers_test.py b/larq/quantizers_test.py
@@ -1,3 +1,5 @@
+import functools
+
 import numpy as np
 import pytest
 import tensorflow as tf
@@ -66,6 +68,12 @@ def test_invalid_usage(self):
             lq.quantizers.get(42)
         with pytest.raises(ValueError):
             lq.quantizers.get("unknown")
+        with pytest.raises(ValueError):
+            lq.quantizers.DoReFa(k_bit=2, mode="unknown")
+        f = lq.quantizers.DoReFa(k_bit=2, mode="activations")
+        f.mode = "unknown"
+        with pytest.raises(ValueError):
+            f.call([0.0])
 
     @pytest.mark.parametrize("quantizer", ["input_quantizer", "kernel_quantizer"])
     def test_layer_as_quantizer(self, quantizer, keras_should_run_eagerly):
@@ -216,22 +224,34 @@ def test_ternarization_with_ternary_weight_networks(self):
         assert not np.any(result > 1)
         assert not np.any(result < -1)
 
-    def test_dorefa_quantize(self):
+    @pytest.mark.parametrize("k_bit", [1, 2, 4, 6, 8])
+    @pytest.mark.parametrize("mode", ["activations", "weights"])
+    def test_dorefa_quantize(self, k_bit, mode):
         x = tf.keras.backend.placeholder(ndim=2)
-        f = tf.keras.backend.function([x], [lq.quantizers.DoReFa(2)(x)])
+        f = tf.keras.backend.function([x], [lq.quantizers.DoReFa(k_bit, mode)(x)])
         real_values = testing_utils.generate_real_values_with_zeros()
         result = f([real_values])[0]
-        k_bit = 2
         n = 2 ** k_bit - 1
+        if mode == "weights":
+            # Create the preprocessed and scaled stimulus, which is then ready to
+            # go through the same test as the activation quantizer
+            divider = np.amax(np.abs(np.tanh(real_values)))
+            real_values = np.tanh(real_values) / divider
+            real_values = (real_values / 2.0) + 0.5
+            # The results, which are currently on the [-1, 1] range, get the
+            # same scaling, so they behave as if they were created on the
+            # activation range and can be tested the same way
+            result = result / 2.0 + 0.5
         assert not np.any(result > 1)
         assert not np.any(result < 0)
         for i in range(n + 1):
-            assert np.all(
+            np.testing.assert_allclose(
                 result[
                     (real_values > (2 * i - 1) / (2 * n))
                     & (real_values < (2 * i + 1) / (2 * n))
-                ]
-                == i / n
+                ],
+                i / n,
+                atol=1e-6,
             )
 
 
@@ -325,19 +345,30 @@ def test_magnitude_aware_sign_grad(self):
             grad.numpy(), np.where(abs(a) < 1, np.ones(a.shape) * scale_vector, 0)
         )
 
-    def test_dorefa_ste_grad(self):
+    @pytest.mark.parametrize("mode", ["activations", "weights"])
+    def test_dorefa_ste_grad(self, mode):
         @np.vectorize
         def ste_grad(x):
             if x <= 1 and x >= 0:
                 return 1.0
             return 0.0
 
+        def tanh_grad(x):
+            # 1/(cosh**2) is the derivative of tanh. The gradients of the
+            # scaling operations cancel each other and the gradient of the
+            # quantizek function is supposed to be 1 everywhere, because it
+            # is used on its linear region only. tanh does all the limiting.
+            dividend = np.amax(np.abs(np.tanh(x)))
+            return 1 / (np.cosh(x) ** 2.0) / dividend
+
+        expected_gradient = ste_grad if mode == "activations" else tanh_grad
+
         x = testing_utils.generate_real_values_with_zeros(shape=(8, 3, 3, 16))
         tf_x = tf.Variable(x)
         with tf.GradientTape() as tape:
-            activation = lq.quantizers.DoReFa(2)(tf_x)
+            activation = lq.quantizers.DoReFa(2, mode)(tf_x)
         grad = tape.gradient(activation, tf_x)
-        np.testing.assert_allclose(grad.numpy(), ste_grad(x))
+        np.testing.assert_allclose(grad.numpy(), expected_gradient(x))
 
 
 @pytest.mark.parametrize(
@@ -350,6 +381,7 @@ def ste_grad(x):
         ("magnitude_aware_sign", lq.quantizers.MagnitudeAwareSign),
         ("ste_tern", lq.quantizers.SteTern),
         ("dorefa_quantizer", lq.quantizers.DoReFa),
+        ("dorefa_quantizer", functools.partial(lq.quantizers.DoReFa, mode="weights")),
     ],
 )
 def test_metrics(quantizer):