##### Copyright 2024 The AI Edge Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Post-training dynamic range quantization

## Overview

[TensorFlow Lite](https://www.tensorflow.org/lite/) supports
converting weights to 8-bit precision as part of model conversion from
a TensorFlow model to TensorFlow Lite's FlatBuffer format. Dynamic range quantization achieves an approximately 4x reduction in the model size. In addition, TFLite supports on-the-fly quantization and dequantization of activations to allow for:

1.  Using quantized kernels for faster implementation when available.
2.  Mixing of floating-point kernels with quantized kernels for different parts
    of the graph.

The activations are always stored in floating point. For ops that
support quantized kernels, the activations are quantized to 8 bits of precision
dynamically prior to processing and are de-quantized to float precision after
processing. Depending on the model being converted, this can give a speedup over
pure floating point computation.

In contrast to
[quantization aware training](https://github.com/tensorflow/tensorflow/tree/r1.14/tensorflow/contrib/quantize),
in this method the weights are quantized post training and the activations are
quantized dynamically at inference.
Therefore, the model weights are not retrained to compensate for quantization
induced errors. It is important to check the accuracy of the quantized model to
ensure that the degradation is acceptable.

This tutorial trains a CIFAR-10 model from scratch, checks its accuracy in
TensorFlow, and then converts the model into a TensorFlow Lite flatbuffer
with dynamic range quantization. Finally, it checks the
accuracy of the converted model and compares it to the original float model.

## Build a CIFAR-10 model

### Setup

In [None]:
import logging
logging.getLogger("tensorflow").setLevel(logging.DEBUG)

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pathlib

DEBUG:tensorflow:Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.


### Train a TensorFlow model

In [None]:
# Load the CIFAR-10 dataset.
cifar10 = keras.datasets.cifar10
(train_images, train_labels), (test_images, test_labels) = cifar10.load_data()

# Normalize the input images so that each pixel value is between 0 and 1.
# Casting to float32 before dividing keeps the arrays at the precision the model
# consumes; dividing the integer pixel arrays directly would promote them to
# float64 and double their memory footprint.
train_images = train_images.astype(np.float32) / 255.0
test_images = test_images.astype(np.float32) / 255.0

# Define the model architecture: one small convolution + pooling stage followed
# by a linear classifier that emits 10 logits (one per CIFAR-10 class).
model = keras.Sequential([
  keras.layers.InputLayer(input_shape=(32, 32, 3)),
  # The target shape equals the input shape, so this layer leaves the tensor
  # unchanged; it is kept so the exported graph stays the same.
  keras.layers.Reshape(target_shape=(32, 32, 3)),
  keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation=tf.nn.relu),
  keras.layers.MaxPooling2D(pool_size=(2, 2)),
  keras.layers.Flatten(),
  keras.layers.Dense(10)
])

# Train the image classification model. The final Dense layer has no softmax,
# hence `from_logits=True` in the loss.
model.compile(optimizer='adam',
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(
  train_images,
  train_labels,
  epochs=10,
  validation_data=(test_images, test_labels)
)



Epoch 1/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.3600 - loss: 1.8153 - val_accuracy: 0.5143 - val_loss: 1.3995
Epoch 2/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5255 - loss: 1.3600 - val_accuracy: 0.5478 - val_loss: 1.2821
Epoch 3/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.5713 - loss: 1.2420 - val_accuracy: 0.5684 - val_loss: 1.2427
Epoch 4/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5856 - loss: 1.1861 - val_accuracy: 0.5707 - val_loss: 1.2196
Epoch 5/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.6099 - loss: 1.1281 - val_accuracy: 0.5853 - val_loss: 1.1921
Epoch 6/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6230 - loss: 1.0933 - val_accuracy: 0.5895 - val_loss: 1.1783
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x7f413f5b1350>

Because this is a small model trained for only 10 epochs, it reaches only about 59% accuracy on the test set.


### Convert to a TensorFlow Lite model

Using the TensorFlow Lite [Converter](https://www.tensorflow.org/lite/models/convert), you can now convert the trained model into a TensorFlow Lite model.

Now load the model using the `TFLiteConverter`:

In [None]:
# Build a converter from the in-memory Keras model. The same `converter` object
# is reused by a later cell that turns on quantization.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
# No optimizations are set yet, so this conversion yields the float baseline.
tflite_model = converter.convert()

INFO:tensorflow:Assets written to: /tmp/tmp4a5vb_tx/assets


Saved artifact at '/tmp/tmp4a5vb_tx'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 32, 32, 3), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  139918218055696: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139918218057808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139918218059152: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139918218057040: TensorSpec(shape=(), dtype=tf.resource, name=None)


Write it out to a tflite file:

In [None]:
# Directory that will hold the exported .tflite files.
tflite_models_dir = pathlib.Path("/tmp/cifar_tflite_models/")
# `exist_ok` and `parents` make this cell safe to re-run on a fresh or used VM.
tflite_models_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Save the float model. `write_bytes` returns the number of bytes written, which
# is displayed as the cell output and gives the float model's size.
tflite_model_file = tflite_models_dir/"cifar_model.tflite"
tflite_model_file.write_bytes(tflite_model)

111520

To quantize the model on export, set the converter's `optimizations` flag to `tf.lite.Optimize.DEFAULT`:

In [None]:
# Enable the default optimization set on the existing converter and convert the
# same Keras model again to obtain the quantized variant.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_quant_model = converter.convert()
# Save it next to the float model so the two file sizes can be compared; the
# byte count returned by `write_bytes` is displayed as the cell output.
tflite_model_quant_file = tflite_models_dir/"cifar_model_quant.tflite"
tflite_model_quant_file.write_bytes(tflite_quant_model)

INFO:tensorflow:Assets written to: /tmp/tmp42jf_72a/assets


Saved artifact at '/tmp/tmp42jf_72a'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 32, 32, 3), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  139918218055696: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139918218057808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139918218059152: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139918218057040: TensorSpec(shape=(), dtype=tf.resource, name=None)


30608

Note how the resulting file is approximately `1/4` the size of the float model.

In [None]:
!ls -lh {tflite_models_dir}

total 144K
-rw-r--r-- 1 root root  30K Feb 21 06:19 cifar_model_quant.tflite
-rw-r--r-- 1 root root 109K Feb 21 06:19 cifar_model.tflite


## Run the TFLite models

Run the TensorFlow Lite model using the Python TensorFlow Lite
Interpreter.


### Load the model into an interpreter

In [None]:
# Interpreter for the float model, loaded from the file written earlier.
interpreter = tf.lite.Interpreter(model_path=str(tflite_model_file))
# Tensors must be allocated before inputs can be set or the model invoked.
interpreter.allocate_tensors()

In [None]:
# Interpreter for the dynamic range quantized model, set up the same way as the
# float one so both can be passed to the same evaluation helper.
interpreter_quant = tf.lite.Interpreter(model_path=str(tflite_model_quant_file))
interpreter_quant.allocate_tensors()

### Evaluate the models

In [None]:
# A helper function to evaluate the TF Lite model using "test" dataset.
def evaluate_model(interpreter, images=None, labels=None):
  """Returns the top-1 accuracy of a TF Lite model on a labelled image set.

  Images are fed to the interpreter one at a time, each with a batch dimension
  of 1 and cast to float32.

  Args:
    interpreter: A TF Lite interpreter whose tensors are already allocated.
    images: Optional sequence of images without a batch dimension. Defaults to
      the notebook-level `test_images`.
    labels: Optional integer class labels, one per image. Both `(N,)` and
      `(N, 1)` layouts are accepted. Defaults to the notebook-level
      `test_labels`.

  Returns:
    The fraction of images whose highest-scoring class equals its label, as a
    Python float.

  Raises:
    ValueError: If the dataset is empty, or if `images` and `labels` do not
      contain the same number of entries.
  """
  # Fall back to the notebook's test split so existing single-argument calls
  # keep working unchanged.
  if images is None:
    images = test_images
  if labels is None:
    labels = test_labels

  # Flatten so that `(N, 1)` label arrays compare element-wise against the
  # 1-D prediction vector instead of broadcasting to an `(N, N)` matrix.
  labels = np.asarray(labels).reshape(-1)
  num_images = len(images)
  if num_images == 0:
    raise ValueError("Cannot evaluate a model on an empty dataset.")
  if num_images != len(labels):
    raise ValueError(
        "images and labels must have the same length, got "
        f"{num_images} and {len(labels)}.")

  input_index = interpreter.get_input_details()[0]["index"]
  output_index = interpreter.get_output_details()[0]["index"]

  # Run predictions on every image in the dataset.
  predicted_classes = np.empty(num_images, dtype=np.int64)
  for i, image in enumerate(images):
    # Pre-processing: add batch dimension and convert to float32 to match with
    # the model's input data format.
    batch = np.expand_dims(image, axis=0).astype(np.float32)
    interpreter.set_tensor(input_index, batch)

    # Run inference.
    interpreter.invoke()

    # Post-processing: remove batch dimension and find the class with the
    # highest score.
    scores = interpreter.get_tensor(output_index)[0]
    predicted_classes[i] = np.argmax(scores)

  # Compare prediction results with ground truth labels to calculate accuracy.
  accurate_count = int(np.count_nonzero(predicted_classes == labels))
  return accurate_count / num_images

In [None]:
# Accuracy of the float TF Lite model on the test set.
print(evaluate_model(interpreter))

0.5921


Repeat the evaluation on the dynamic range quantized model to obtain:


In [None]:
# Accuracy of the dynamic range quantized TF Lite model on the same test set.
print(evaluate_model(interpreter_quant))

0.5921


In this example, the quantized model achieves the same accuracy as the float model.