# Convolutional Neural Networks on PYNQ

Let's use the same quantized and pruned model as in the CNN tutorial (part6_cnns.ipynb)

## Start with the necessary imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import time
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

## Fetch the SVHN dataset using TensorFlow Datasets

In [2]:
# Directory holding the TFDS copy of the dataset (site-specific path).
tfds_data_dir = "/eos/home-t/thaarres/tensorflow_datasets/"

# Training set: the first 90% of the 'train' split. The dataset info object is
# requested here as well, so the metadata below can be read from it.
ds_train, info = tfds.load(
    'svhn_cropped',
    split='train[:90%]',
    with_info=True,
    as_supervised=True,
    data_dir=tfds_data_dir,
)
# Test set: the complete 'test' split.
ds_test = tfds.load(
    'svhn_cropped',
    split='test',
    shuffle_files=True,
    as_supervised=True,
    data_dir=tfds_data_dir,
)
# Validation set: the last 10% of the 'train' split.
ds_val = tfds.load(
    'svhn_cropped',
    split='train[-10%:]',
    shuffle_files=True,
    as_supervised=True,
    data_dir=tfds_data_dir,
)

assert isinstance(ds_train, tf.data.Dataset)

# Metadata used further down. Note that train_size is the example count of the
# whole 'train' split as reported by the dataset info, not of the 90% subset.
image_feature = info.features['image']
label_feature = info.features['label']
train_size = int(info.splits['train'].num_examples)
input_shape = image_feature.shape
n_classes = label_feature.num_classes

print('Training on {} samples of input shape {}, belonging to {} classes'.format(train_size,input_shape,n_classes))

Training on 73257 samples of input shape (32, 32, 3), belonging to 10 classes


In [3]:
def preprocess(image, label,nclasses=10):
  """Turn a raw (image, label) pair into model-ready tensors.

  Args:
    image: image data; cast to float32 and divided by 255.
    label: class index (or indices); squeezed and one-hot encoded.
    nclasses: depth of the one-hot encoding.

  Returns:
    Tuple ``(image, label)`` with the scaled image and the one-hot label.
  """
  scaled_image = tf.cast(image, tf.float32) / 255.
  class_index = tf.squeeze(label)
  one_hot_label = tf.one_hot(class_index, nclasses)
  return scaled_image, one_hot_label

In [4]:
batch_size = 1024


def _preprocess_with_n_classes(image, label):
    """Apply `preprocess` using the number of classes read from the dataset info."""
    return preprocess(image, label, nclasses=n_classes)


# The second positional parameter of Dataset.map is `num_parallel_calls`; it is not
# forwarded to the mapped function. The class count is therefore bound explicitly
# above and the parallelism is requested by keyword.
# Get dataset as image and one-hot encoded labels, divided by max RGB
train_data = ds_train.map(_preprocess_with_n_classes,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_data = train_data.shuffle(4096).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# Pull a single batch to report the batch shapes.
example = next(iter(train_data.take(1)))
print("X train batch shape = {}, Y train batch shape = {} ".format(example[0].shape, example[1].shape))

val_data = ds_val.map(_preprocess_with_n_classes,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_data = val_data.batch(batch_size)
val_data = val_data.prefetch(tf.data.experimental.AUTOTUNE)

# For  testing, we get the full dataset in memory as it's rather small.
# We fetch it as numpy arrays to have access to labels and images separately
X_test, Y_test = tfds.as_numpy(tfds.load('svhn_cropped',split='test',batch_size=-1,as_supervised=True,data_dir="/eos/home-t/thaarres/tensorflow_datasets/"))
X_test, Y_test = preprocess(X_test, Y_test,nclasses=n_classes)
print("X test batch shape = {}, Y test batch shape = {} ".format(X_test.shape,Y_test.shape))

X train batch shape = (1024, 32, 32, 3), Y train batch shape = (1024, 10) 
X test batch shape = (26032, 32, 32, 3), Y test batch shape = (26032, 10) 


## Define a quantized and pruned model

In [5]:
from qkeras import QActivation
from qkeras import QDense, QConv2DBatchnorm
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.regularizers import l1
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense

from tensorflow.keras.models import Model

# Network layout: one fused conv+BN block per entry in filters_per_conv_layer,
# followed by one dense block per entry in neurons_per_dense_layer.
filters_per_conv_layer = [16,16,24]
neurons_per_dense_layer = [42,64]

# Quantizer specifications shared by all quantized layers of the model.
weight_quantizer = "quantized_bits(6,0,alpha=1)"
activation_quantizer = 'quantized_relu(6)'

x_in = Input(shape=input_shape)
x = x_in

# Convolutional part: fused QConv2D+BatchNorm -> quantized ReLU -> 2x2 max pooling.
for block_idx, n_filters in enumerate(filters_per_conv_layer):
    print('Adding fused QConv+BN block {} with N={} filters'.format(block_idx, n_filters))
    conv_bn = QConv2DBatchnorm(
        int(n_filters),
        kernel_size=(3, 3),
        strides=(1, 1),
        kernel_quantizer=weight_quantizer,
        bias_quantizer=weight_quantizer,
        kernel_initializer='lecun_uniform',
        kernel_regularizer=l1(0.0001),
        use_bias=True,
        name='fused_convbn_{}'.format(block_idx),
    )
    x = conv_bn(x)
    x = QActivation(activation_quantizer, name='conv_act_%i' % block_idx)(x)
    x = MaxPooling2D(pool_size=(2, 2), name='pool_{}'.format(block_idx))(x)
x = Flatten()(x)

# Dense part: bias-free QDense -> BatchNormalization -> quantized ReLU.
for block_idx, n_neurons in enumerate(neurons_per_dense_layer):
    print('Adding QDense block {} with N={} neurons'.format(block_idx, n_neurons))
    dense = QDense(
        n_neurons,
        kernel_quantizer=weight_quantizer,
        kernel_initializer='lecun_uniform',
        kernel_regularizer=l1(0.0001),
        name='dense_%i' % block_idx,
        use_bias=False,
    )
    x = dense(x)
    x = BatchNormalization(name='bn_dense_{}'.format(block_idx))(x)
    x = QActivation(activation_quantizer, name='dense_act_%i' % block_idx)(x)

# Output head: a regular (non-quantized) Dense layer followed by softmax.
x = Dense(int(n_classes), name='output_dense')(x)
x_out = Activation('softmax', name='output_softmax')(x)
qmodel = Model(inputs=[x_in], outputs=[x_out], name='qkeras')

qmodel.summary()

Adding fused QConv+BN block 0 with N=16 filters
Adding fused QConv+BN block 1 with N=16 filters
Adding fused QConv+BN block 2 with N=24 filters
Adding QDense block 0 with N=42 neurons
Adding QDense block 1 with N=64 neurons
Model: "qkeras"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
fused_convbn_0 (QConv2DBatch (None, 30, 30, 16)        513       
_________________________________________________________________
conv_act_0 (QActivation)     (None, 30, 30, 16)        0         
_________________________________________________________________
pool_0 (MaxPooling2D)        (None, 15, 15, 16)        0         
_________________________________________________________________
fused_convbn_1 (QConv2DBatch (None, 13, 13, 16)        2385      
__________________________________

In [6]:
# Print the quantized layers of qmodel together with the quantizers configured for them
from qkeras.autoqkeras.utils import print_qmodel_summary
print_qmodel_summary(qmodel)        

fused_convbn_0       f=16 quantized_bits(6,0,0,alpha=1) quantized_bits(6,0,0,alpha=1) 
conv_act_0           quantized_relu(6)
fused_convbn_1       f=16 quantized_bits(6,0,0,alpha=1) quantized_bits(6,0,0,alpha=1) 
conv_act_1           quantized_relu(6)
fused_convbn_2       f=24 quantized_bits(6,0,0,alpha=1) quantized_bits(6,0,0,alpha=1) 
conv_act_2           quantized_relu(6)
dense_0              u=42 quantized_bits(6,0,0,alpha=1) 
bn_dense_0           is normal keras bn layer
dense_act_0          quantized_relu(6)
dense_1              u=64 quantized_bits(6,0,0,alpha=1) 
bn_dense_1           is normal keras bn layer
dense_act_1          quantized_relu(6)



In [7]:
import tensorflow_model_optimization as tfmot
from tensorflow_model_optimization.sparsity import keras as sparsity
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_callbacks

# Number of full batches per epoch, computed from 90% of train_size because
# ds_train only holds the first 90% of the 'train' split (the rest is used for validation).
NSTEPS = int(train_size*0.9)  // batch_size # 90% train, 10% validation split
print('Number of training steps per epoch is {}'.format(NSTEPS))

def pruneFunction(layer):
    """Clone function that wraps prunable layers for magnitude pruning.

    Convolutional layers (instances of Conv2D) and Dense layers other than
    'output_dense' are wrapped with prune_low_magnitude; every other layer is
    returned unchanged.

    The sparsity follows a polynomial decay schedule from 0% to 50%: it starts
    at step NSTEPS*2, ends at step NSTEPS*10 and is updated every NSTEPS steps
    (NSTEPS being the number of training steps per epoch).
    """
    schedule = sparsity.PolynomialDecay(
        initial_sparsity=0.0,
        final_sparsity=0.50,
        begin_step=NSTEPS*2,
        end_step=NSTEPS*10,
        frequency=NSTEPS,
    )

    is_conv = isinstance(layer, tf.keras.layers.Conv2D)
    is_hidden_dense = isinstance(layer, tf.keras.layers.Dense) and layer.name != 'output_dense'
    if not (is_conv or is_hidden_dense):
        return layer
    return tfmot.sparsity.keras.prune_low_magnitude(layer, pruning_schedule=schedule)

# Clone qmodel, passing every layer through pruneFunction so the selected ones get wrapped for pruning
qmodel_pruned = tf.keras.models.clone_model( qmodel, clone_function=pruneFunction)

Number of training steps per epoch is 64
Instructions for updating:
Please use `layer.add_weight` method instead.


Instructions for updating:
Please use `layer.add_weight` method instead.


In [8]:
# Switch between training the pruned model from scratch (True) and loading
# the model previously saved to 'pynq_cnn_model.h5' (False).
train = False

n_epochs = 30
if train:
    # Categorical cross-entropy: the input pipeline yields one-hot encoded labels.
    LOSS        = tf.keras.losses.CategoricalCrossentropy()
    OPTIMIZER   = tf.keras.optimizers.Adam(learning_rate=3E-3, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=True) 
    qmodel_pruned.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=["accuracy"])

    # Early stopping, halving of the learning rate when val_loss plateaus, and the
    # tfmot callback that accompanies the pruning wrappers during training.
    callbacks = [
            tf.keras.callbacks.EarlyStopping(patience=10, verbose=1),
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1),
            pruning_callbacks.UpdatePruningStep()
            ]  

    # Time the training so its duration can be reported in minutes.
    start = time.time()
    history = qmodel_pruned.fit(train_data,
                          epochs = n_epochs,
                          validation_data = val_data,
                          callbacks = callbacks, 
                          verbose=1)     
    end = time.time()
    print('\n It took {} minutes to train!\n'.format( (end - start)/60.))

    # Persist the trained model (still containing the pruning wrappers) for later cells.
    qmodel_pruned.save('pynq_cnn_model.h5')

else:
    from qkeras.utils import _add_supported_quantized_objects
    from tensorflow_model_optimization.python.core.sparsity.keras import pruning_wrapper
    
    # Collect the custom objects needed to deserialize the saved model:
    # the QKeras objects plus the pruning wrapper class.
    co = {}
    _add_supported_quantized_objects(co)
    co['PruneLowMagnitude'] = pruning_wrapper.PruneLowMagnitude
    qmodel_pruned = tf.keras.models.load_model('pynq_cnn_model.h5', custom_objects=co)

## Make bitfile


In [9]:
from tensorflow_model_optimization.sparsity.keras import strip_pruning
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_wrapper
from qkeras import QActivation
from qkeras import QDense, QConv2DBatchnorm

# Classes Keras cannot resolve on its own when deserializing the saved model:
# the pruning wrapper and the QKeras layer types used in the network.
saved_model_custom_objects = {
    'PruneLowMagnitude': pruning_wrapper.PruneLowMagnitude,
    'QDense': QDense,
    'QConv2DBatchnorm': QConv2DBatchnorm,
    'QActivation': QActivation,
}

# Load the saved model from disk, then remove the pruning wrappers from it.
model = tf.keras.models.load_model('pynq_cnn_model.h5',
                                   custom_objects=saved_model_custom_objects)
model = strip_pruning(model)

In [None]:
import hls4ml
# Configure the hls4ml optimizer pass to use rounding (AP_RND) and saturation (AP_SAT)
# for the outputs of 'Activation' layers.
hls4ml.model.optimizer.OutputRoundingSaturationMode.layers = ['Activation']
hls4ml.model.optimizer.OutputRoundingSaturationMode.rounding_mode = 'AP_RND'
hls4ml.model.optimizer.OutputRoundingSaturationMode.saturation_mode = 'AP_SAT'
hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name')

# Model-wide defaults (this replaces the generated 'Model' section).
hls_config['Model'] = {}
hls_config['Model']['ReuseFactor'] = 2
hls_config['Model']['Strategy'] = 'Resource'
hls_config['Model']['Precision'] = 'ap_fixed<16,6>'

# Per-layer reuse factors.
hls_config['LayerName']['fused_convbn_0']['ReuseFactor'] = 9
hls_config['LayerName']['fused_convbn_1']['ReuseFactor'] = 36
hls_config['LayerName']['fused_convbn_2']['ReuseFactor'] = 36
hls_config['LayerName']['dense_0']['ReuseFactor'] = 672
# dense_1 maps 42 inputs to 64 outputs, i.e. 42*64 = 2688 multiplications. The previous
# value 366 does not divide 2688; 336 does (2688 / 336 = 8).
# NOTE(review): hls4ml is expected to require a reuse factor that divides the number of
# multiplications of the layer - confirm against the hls4ml version in use.
hls_config['LayerName']['dense_1']['ReuseFactor'] = 336
hls_config['LayerName']['output_dense']['ReuseFactor'] = 128

hls_model = hls4ml.converters.convert_from_keras_model(model=model,
                                                       backend='VivadoAccelerator',
                                                       io_type='io_stream', 
                                                       device='pynq-z2',
                                                       hls_config=hls_config, 
                                                       output_dir="pynq_cnn_pynq-z2")

# Run HLS synthesis and export (no C simulation), then print the resulting report.
hls_model.build(csim=False, synth=True, export=True)
hls4ml.report.read_vivado_report('pynq_cnn_pynq-z2/')

hls4ml.templates.VivadoAcceleratorBackend.make_bitfile(hls_model)

Interpreting Model
Topology:
Layer name: input_1, layer type: Input
Layer name: fused_convbn_0, layer type: QConv2DBatchnorm
Layer name: conv_act_0, layer type: QActivation
Layer name: pool_0, layer type: MaxPooling2D
Layer name: fused_convbn_1, layer type: QConv2DBatchnorm
Layer name: conv_act_1, layer type: QActivation
Layer name: pool_1, layer type: MaxPooling2D
Layer name: fused_convbn_2, layer type: QConv2DBatchnorm
Layer name: conv_act_2, layer type: QActivation
Layer name: pool_2, layer type: MaxPooling2D
Layer name: dense_0, layer type: QDense
Layer name: bn_dense_0, layer type: BatchNormalization
Layer name: dense_act_0, layer type: QActivation
Layer name: dense_1, layer type: QDense
Layer name: bn_dense_1, layer type: BatchNormalization
Layer name: dense_act_1, layer type: QActivation
Layer name: output_dense, layer type: Dense
  -> Activation (linear), layer name: output_dense
Layer name: output_softmax, layer type: Activation
Interpreting Model
Topology:
Layer name: input_1

In [None]:
# Compare both implementations on the first 3000 test samples only.
n_reduced_samples = 3000
X_test_reduced, Y_test_reduced = X_test[:n_reduced_samples], Y_test[:n_reduced_samples]

# Reference predictions from the Keras model.
y_predict = model.predict(X_test_reduced)

# Predictions from the hls4ml model, which is fed a C-contiguous array.
hls_input = np.ascontiguousarray(X_test_reduced)
y_predict_hls4ml = hls_model.predict(hls_input)

import plotting
from sklearn.metrics import accuracy_score

def plotROC(Y, y_pred, y_pred_hls4ml, label="Model"):
    """Print the accuracies and overlay the ROC curves of two sets of predictions.

    Args:
        Y: one-hot encoded true labels.
        y_pred: per-class scores from the Keras model (drawn with solid lines).
        y_pred_hls4ml: per-class scores from the hls4ml model (drawn with dashed lines).
        label: free text placed on the figure.
    """
    from matplotlib.legend import Legend
    from matplotlib.lines import Line2D

    # Accuracy compares the arg-max class of the truth with that of each prediction.
    true_classes = np.argmax(Y, axis=1)
    accuracy_keras = float(accuracy_score(true_classes, np.argmax(y_pred, axis=1)))
    accuracy_hls4ml = float(accuracy_score(true_classes, np.argmax(y_pred_hls4ml, axis=1)))

    print("Accuracy Keras:  {}".format(accuracy_keras))
    print("Accuracy hls4ml: {}".format(accuracy_hls4ml))

    _, ax = plt.subplots(figsize=(9, 9))
    _ = plotting.makeRoc(Y, y_pred, labels=['%i' % cls for cls in range(n_classes)])
    # Restart the color cycle so each class keeps its color in the second set of curves.
    plt.gca().set_prop_cycle(None)
    _ = plotting.makeRoc(Y, y_pred_hls4ml, labels=['%i' % cls for cls in range(n_classes)], linestyle='--')

    # Second legend that maps the line style to the implementation.
    style_handles = [Line2D([0], [0], ls=style) for style in ('-', '--')]
    style_legend = Legend(ax, style_handles, labels=['Keras', 'hls4ml'],
                          loc='lower right', frameon=False)
    ax.add_artist(style_legend)

    plt.figtext(0.2, 0.38, label, wrap=True,
                horizontalalignment='left', verticalalignment='center')
    plt.ylim(0.01, 1.)
    plt.xlim(0.7, 1.)

# Plot the ROC curves of the Keras and hls4ml predictions on the reduced test set
plotROC(Y_test_reduced,y_predict,y_predict_hls4ml,label="QKeras") 

In [None]:
def _report_cells(lines, *labels):
    """Return the '|'-separated cells of the first line containing any of `labels`."""
    for line in lines:
        if any(label in line for label in labels):
            return line.split('|')
    # IndexError is kept as the failure type for a missing row.
    raise IndexError('No line containing any of {} found in report'.format(labels))


def getReports(indir, clock_period_ns=5.0):
    """Collect resource usage and latency numbers from the synthesis reports in `indir`.

    Two files are read:
      * ``<indir>/vivado_synth.rpt`` (logic synthesis utilization table)
      * ``<indir>/myproject_prj/solution1/syn/report/myproject_csynth.rpt`` (HLS report)

    Args:
        indir: project directory containing the reports.
        clock_period_ns: clock period in nanoseconds used to convert the latency
            from cycles to microseconds (default 5.0).

    Returns:
        dict with the keys 'lut', 'ff', 'bram', 'dsp' (column 2 of the matching table
        row), 'lut_rel', 'ff_rel', 'bram_rel', 'dsp_rel' (column 5 of the same row),
        'latency_clks', 'latency_mus' and 'latency_ii'. An empty dict is returned when
        either report file is missing.

    Raises:
        IndexError: if an expected row is not present in a report.
    """
    data_ = {}

    report_vsynth = Path('{}/vivado_synth.rpt'.format(indir))
    report_csynth = Path('{}/myproject_prj/solution1/syn/report/myproject_csynth.rpt'.format(indir))

    if not (report_vsynth.is_file() and report_csynth.is_file()):
        # Best effort: a missing report is not an error, but say so instead of
        # silently handing back an empty result.
        print('No vsynth and csynth reports found in {}; returning an empty result'.format(indir))
        return data_

    print('Found valid vsynth and synth in {}! Fetching numbers'.format(indir))

    # Get the resources from the logic synthesis report
    with report_vsynth.open() as report:
        lines = report.readlines()

    # NOTE(review): the 'Slice ...' labels are accepted in addition to 'CLB ...' because
    # 7-series parts are expected to name these rows that way - verify against the
    # reports produced by your Vivado version.
    lut_cells  = _report_cells(lines, 'CLB LUTs*', 'Slice LUTs*')
    ff_cells   = _report_cells(lines, 'CLB Registers', 'Slice Registers')
    bram_cells = _report_cells(lines, 'Block RAM Tile')
    dsp_cells  = _report_cells(lines, 'DSPs')

    data_['lut']      = int(lut_cells[2])
    data_['ff']       = int(ff_cells[2])
    data_['bram']     = float(bram_cells[2])
    data_['dsp']      = int(dsp_cells[2])
    data_['lut_rel']  = float(lut_cells[5])
    data_['ff_rel']   = float(ff_cells[5])
    data_['bram_rel'] = float(bram_cells[5])
    data_['dsp_rel']  = float(dsp_cells[5])

    # Get the latency from the HLS report: the values sit three lines below
    # the first line containing 'Latency (cycles)'.
    with report_csynth.open() as report:
        lines = report.readlines()

    header_rows = [idx for idx, line in enumerate(lines) if 'Latency (cycles)' in line]
    if not header_rows:
        raise IndexError("No 'Latency (cycles)' line found in {}".format(report_csynth))
    lat_cells = lines[header_rows[0] + 3].split('|')

    data_['latency_clks'] = int(lat_cells[2])
    data_['latency_mus']  = float(lat_cells[2])*clock_period_ns/1000.
    data_['latency_ii']   = int(lat_cells[6])

    return data_

from pathlib import Path
import pprint 

# Read the reports of the pynq-z2 build. The directory has to be the `output_dir`
# given to hls4ml for that build ("pynq_cnn_pynq-z2"); for a directory without
# reports getReports returns an empty dict.
data = getReports('pynq_cnn_pynq-z2')

print("\n Resource usage and latency: PYNQ CNN")
pprint.pprint(data)


In [None]:
# Now the big guy: same model, converted for the zcu102 device with a
# model-wide reuse factor of 1 and no per-layer overrides.

hls4ml.model.optimizer.OutputRoundingSaturationMode.saturation_mode = 'AP_SAT'
hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name')

# Model-wide defaults (this replaces the generated 'Model' section).
hls_config['Model'] = {
    'ReuseFactor': 1,
    'Strategy': 'Resource',
    'Precision': 'ap_fixed<16,6>',
}

zcu102_conversion_args = dict(
    model=model,
    backend='VivadoAccelerator',
    io_type='io_stream',
    device='zcu102',
    hls_config=hls_config,
    output_dir="pynq_cnn_zcu102",
)
hls_model = hls4ml.converters.convert_from_keras_model(**zcu102_conversion_args)

# Run HLS synthesis and export (no C simulation), then print the resulting report.
hls_model.build(csim=False, synth=True, export=True)
hls4ml.report.read_vivado_report('pynq_cnn_zcu102/')

hls4ml.templates.VivadoAcceleratorBackend.make_bitfile(hls_model)

In [None]:
# Re-run the hls4ml prediction with the zcu102 model and compare it with the Keras predictions
y_predict_hls4ml = hls_model.predict(np.ascontiguousarray(X_test_reduced))
plotROC(Y_test_reduced,y_predict,y_predict_hls4ml,label="zcu102") 

# Read the synthesis reports from the output directory of the zcu102 build
data = getReports('pynq_cnn_zcu102')

print("\n Resource usage and latency: zcu102 CNN")
pprint.pprint(data)