In [1]:
# Train a CIFAR-10 classifier without any quantization, then run inference with copies of that model whose
# activation functions are quantized to different numbers of levels, and compare the resulting accuracies.
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist, fashion_mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import os
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

In [5]:
# Load CIFAR-10 (downloads the archive on first use).
cifar10 = tf.keras.datasets.cifar10
(train_x, train_y), (test_x, test_y) = cifar10.load_data()
print('\n train_x:%s, train_y:%s, test_x:%s, test_y:%s'%(train_x.shape,train_y.shape,test_x.shape,test_y.shape)) 

# Normalize pixels to [0, 1] as float32. Dividing the integer image arrays
# directly by 255.0 would produce float64, doubling the memory footprint and
# mismatching the float32 inputs used by Keras and the TFLite converter.
x_train = train_x.astype(np.float32) / 255.0
x_test = test_x.astype(np.float32) / 255.0
# Integer class ids, as required by the sparse categorical loss/metric.
y_train, y_test = tf.cast(train_y, tf.int8), tf.cast(test_y, tf.int8)



def create_model():
    """Build and compile the unquantized baseline CNN.

    Architecture: two 3x3 convolution layers (32 and 64 filters) with batch
    normalization after the first and max-pooling + dropout after the second,
    followed by a 64-unit dense layer and a 10-way softmax output.
    The input shape is taken from the global ``x_train``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, sparse categorical accuracy metric).
    """
    feature_layers = [
        # Convolution stage 1
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=x_train.shape[1:]),
        BatchNormalization(),
        # Convolution stage 2
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        # (An optional third stage, Conv2D(128, (3, 3)) + Dropout(0.4), is currently disabled.)
    ]
    classifier_layers = [
        Flatten(),
        # Fully connected hidden layer
        Dense(64, activation='relu'),
        Dropout(0.3),
        # Class-probability output
        Dense(10, activation='softmax'),
    ]

    model = Sequential(feature_layers + classifier_layers)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )
    return model

# Fit the float (unquantized) baseline; 10% of the training data is held out for validation.
model = create_model()
model.fit(x=x_train, y=y_train, epochs=5, validation_split=0.1)


 train_x:(50000, 32, 32, 3), train_y:(50000, 1), test_x:(10000, 32, 32, 3), test_y:(10000, 1)


2024-06-04 14:55:44.555713: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 72.0KiB (rounded to 73728)requested by op AddV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-06-04 14:55:44.555746: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2024-06-04 14:55:44.555758: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 37, Chunks in use: 37. 9.2KiB allocated for chunks. 9.2KiB in use in bin. 3.3KiB client-requested in use in bin.
2024-06-04 14:55:44.555766: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 1, Chunks in use: 1. 512B allocated for chunks. 512B in use in bin. 512B client-requested in use in bin.
2024-06-04 14:55:44.555774: I tensorflow/core/comm

ResourceExhaustedError: failed to allocate memory [Op:AddV2]

In [None]:
# Baseline: test-set accuracy of the trained model with ordinary (unquantized) activations.
baseline_metrics = model.evaluate(x=x_test, y=y_test, verbose=0)
original_accuracy = baseline_metrics[1]  # index 0 is the loss, index 1 the accuracy metric
print(f"Original Model Accuracy: {original_accuracy:.4f}")


def quantized_relu(x, levels):
    """ReLU whose output is snapped onto ``levels`` evenly spaced values.

    The input is rectified, divided by the maximum of the whole rectified
    tensor so it lies in [0, 1], rounded onto the grid
    {0, 1/(levels-1), ..., 1}, and finally multiplied back by that maximum.

    Args:
        x: Tensor of pre-activation values.
        levels: Number of quantization levels; must be at least 2.

    Returns:
        A tensor shaped like ``x`` with values in [0, max(relu(x))].

    Raises:
        ValueError: If ``levels`` is smaller than 2 (the step size
            ``1 / (levels - 1)`` would be undefined).
    """
    if levels < 2:
        raise ValueError(f"levels must be >= 2, got {levels}")

    x = tf.nn.relu(x)
    max_val = tf.reduce_max(x)
    # Normalize to [0, 1]. When every activation is zero, max_val is 0 and a
    # plain division would produce NaN; divide_no_nan returns 0 in that case.
    x_normalized = tf.math.divide_no_nan(x, max_val)
    # Snap onto the quantization grid.
    steps = levels - 1
    x_quantized = tf.round(x_normalized * steps) / steps
    # Scale back to [0, max_val].
    return x_quantized * max_val
    
def quantized_softmax(x, levels):
    """Softmax whose probabilities are rounded onto ``levels`` evenly spaced values.

    Args:
        x: Tensor of logits.
        levels: Number of quantization levels; must be at least 2.

    Returns:
        A tensor shaped like ``x`` whose entries are multiples of
        ``1 / (levels - 1)`` in [0, 1]. After rounding, the entries are not
        guaranteed to sum to 1.

    Raises:
        ValueError: If ``levels`` is smaller than 2 (the step size
            ``1 / (levels - 1)`` would be undefined).
    """
    if levels < 2:
        raise ValueError(f"levels must be >= 2, got {levels}")

    x_softmax = tf.nn.softmax(x)
    # Softmax outputs already lie in [0, 1], so they can be quantized directly.
    steps = levels - 1
    return tf.round(x_softmax * steps) / steps

In [None]:
# NOTE(review): this variable may only take full effect when set before TensorFlow is imported -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  
quantization_levels = [4, 8, 16, 32]
accuracy_list = []

# Labels as a flat NumPy array so every comparison below yields a plain bool.
test_labels = np.asarray(y_test).reshape(-1)


def _build_quantized_model(levels):
    """Return the trained network's architecture with quantized activations.

    The layer stack mirrors `create_model()` layer for layer; otherwise the
    trained weights could not be copied across.
    """
    # `levels` is a function argument, so each model's activations keep their
    # own value instead of reading the loop variable at call time.
    q_relu = lambda x: quantized_relu(x, levels)
    q_softmax = lambda x: quantized_softmax(x, levels)
    return Sequential([
        # Convolutional layer 1
        Conv2D(32, kernel_size=(3, 3), activation=q_relu, input_shape=x_train.shape[1:]),
        BatchNormalization(),
        # Convolutional layer 2
        Conv2D(64, (3, 3), activation=q_relu),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        Flatten(),
        # Fully connected layer 1
        Dense(64, activation=q_relu),
        Dropout(0.3),
        # Output layer
        Dense(10, activation=q_softmax),
    ])


def representative_dataset_gen():
    """Yield 100 random single-image float32 batches for converter calibration."""
    for _ in range(100):
        sample_index = np.random.randint(x_train.shape[0], size=1)
        # The converter expects float32 samples for a float-trained model.
        yield [np.asarray(x_train[sample_index], dtype=np.float32)]


def _tflite_accuracy(model_path, images, labels):
    """Run the TFLite model at `model_path` on `images` and return its accuracy.

    `images` are float pixel values in [0, 1]; they are quantized to uint8
    using the scale and zero point recorded for the model's input tensor.
    """
    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()
    input_info = interpreter.get_input_details()[0]
    output_info = interpreter.get_output_details()[0]

    scale, zero_point = input_info['quantization']
    if scale == 0:
        # No quantization parameters recorded: fall back to plain 0-255 scaling.
        scale, zero_point = 1.0 / 255.0, 0
    # Round (rather than truncate) so values are not biased one step downwards.
    quantized = np.round(np.asarray(images, dtype=np.float32) / scale + zero_point)
    images_uint8 = np.clip(quantized, 0, 255).astype(np.uint8)

    correct_predictions = 0
    for image, label in zip(images_uint8, labels):
        interpreter.set_tensor(input_info['index'], np.expand_dims(image, axis=0))
        interpreter.invoke()
        output = interpreter.get_tensor(output_info['index'])
        if np.argmax(output) == label:
            correct_predictions += 1
    return correct_predictions / len(labels)


for levels in quantization_levels:
    new_model = _build_quantized_model(levels)

    # Copy the trained weights; zip() alone would silently ignore a mismatch
    # in the number of layers, so check it explicitly first.
    if len(new_model.layers) != len(model.layers):
        raise ValueError(
            f"Layer count mismatch: trained model has {len(model.layers)} layers, "
            f"quantized model has {len(new_model.layers)}"
        )
    for layer, new_layer in zip(model.layers, new_model.layers):
        new_layer.set_weights(layer.get_weights())

    # Compile the model
    new_model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['sparse_categorical_accuracy']) 

    # Convert the model to a TensorFlow Lite model with full-integer quantization
    converter = tf.lite.TFLiteConverter.from_keras_model(new_model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset_gen
    # Restrict to int8 builtin ops: conversion fails if an op cannot be quantized
    # (there is no float32 fallback with this setting).
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    # Make the model's input and output tensors uint8 as well
    converter.inference_input_type = tf.uint8
    converter.inference_output_type = tf.uint8

    tflite_model = converter.convert()

    # Save the quantized model in the current working directory.
    # The file name says "mnist" although the data is CIFAR-10; it is kept
    # unchanged so previously written files and paths still match.
    model_path = os.path.join(os.getcwd(), f'quantized_mnist_model_{levels}levels.tflite')
    with open(model_path, 'wb') as f:
        f.write(tflite_model)

    print(f"Quantized model saved to: {model_path}")

    # Reload the saved file and measure accuracy on the test set
    accuracy = _tflite_accuracy(model_path, x_test, test_labels)
    accuracy_list.append(accuracy)
    print(f"Quantized Model Accuracy with Quantized relu({levels} levels): {accuracy * 100:.2f}%")

In [None]:
# Recap: accuracy measured for each quantization level, in sweep order.
# zip() pairs each level with its result, replacing the manual index counter.
for levels, level_accuracy in zip(quantization_levels, accuracy_list):
    print(f"Quantized Model Accuracy with Quantized relu({levels} levels): {level_accuracy * 100:.2f}%")