# FPGA ML Inference using the hls4ml oneAPI backend

In [22]:
!which icpx

/opt/intel/oneapi/compiler/2024.2/bin/icpx


In [61]:
import os
import hls4ml
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Conv1D, Conv2D, Flatten, MaxPool1D, MaxPool2D, Activation, BatchNormalization, Dropout
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam
import tensorflow
import numpy as np

from qkeras.qconvolutional import QConv1D
from qkeras.qlayers import QDense
from qkeras.qrecurrent import QGRU
from qkeras.quantizers import quantized_bits, quantized_sigmoid, quantized_relu

## Conv1D and GRU

Example of a Conv1D layer followed by a GRU layer, converted with the oneAPI backend.



In [None]:
def get_gru():
    """Build the Conv1D + GRU example model and convert it with hls4ml.

    The Keras model is a ``Sequential`` made of one ``Conv1D`` layer
    (16 filters, kernel size 3, 'same' padding, ReLU) taking inputs of shape
    ``(32, 3)``, followed by a ``GRU`` layer with 16 units. The model summary
    is printed as a side effect.

    Returns:
        tuple: ``(model, config, hls_model)`` -- the compiled Keras model, the
        hls4ml config created from it (granularity='name', default precision
        'ac_fixed<16, 6>'), and the hls4ml model converted for the oneAPI
        backend (part "Agilex7", output directory "model_gru_out").
    """
    keras_model = Sequential([
        Conv1D(16, kernel_size=3, padding='same', input_shape=(32, 3), activation='relu'),
        GRU(16),
    ])
    keras_model.compile(loss='mse', optimizer=Adam())
    keras_model.summary()

    hls_config = hls4ml.utils.config_from_keras_model(
        keras_model,
        granularity='name',
        default_precision='ac_fixed<16, 6>',
    )
    converted = hls4ml.converters.convert_from_keras_model(
        model=keras_model,
        output_dir="model_gru_out",
        backend="oneAPI",
        part="Agilex7",
        hls_config=hls_config,
    )
    return keras_model, hls_config, converted


gru_model, config, gru_hls = get_gru()

In [16]:
# Reference result on CPU: run the Keras model on a batch containing a single
# all-ones sample of shape (32, 3).
gru_model.predict(np.ones((1,32,3)))



array([[-0.39135367,  0.2110026 ,  0.11646275, -0.13366926, -0.00367545,
         0.0204969 , -0.15100211, -0.13510902, -0.07215655, -0.06282963,
         0.05902307, -0.09994041,  0.14548996,  0.15603784,  0.3336328 ,
        -0.08850994]], dtype=float32)

In [None]:
# Build another Conv1D + GRU model / config / converted-model triple by
# reusing get_gru() instead of repeating its body line for line (the helper
# also prints the model summary). `config` and `hls_model` are reassigned by
# later cells of this notebook.
model, config, hls_model = get_gru()

In [17]:
# oneAPI backend: compile the hls4ml model returned by get_gru().
gru_hls.compile()

Writing HLS project
Done
/opt/intel/oneapi/compiler/2024.2/bin/icpx
-- Configuring the design to run on FPGA board Agilex7
-- Additional USER_FPGA_FLAGS=-Wno-unused-label
-- Additional USER_FLAGS=-Wno-unused-label;-fconstexpr-steps=134217728
-- Additional USER_INCLUDE_PATHS=src;src/firmware
-- Additional USER_LIB_PATHS=
-- Additional USER_LIBS=
-- Configuring done (0.0s)
-- Generating done (0.0s)
-- Build files have been written to: /home/sdemirso/HLS4ML/model_gru_out/build
[ 33%] [32mBuilding CXX object CMakeFiles/lib.dir/src/firmware/myproject.cpp.o[0m
[ 66%] [32mBuilding CXX object CMakeFiles/lib.dir/src/myproject_bridge.cpp.o[0m
[100%] [32m[1mLinking CXX shared library libmyproject-ef7237fa.so[0m


libunwind: __unw_add_dynamic_fde: bad fde: FDE is really a CIE


[100%] Built target lib


In [18]:
# Same all-ones input as the CPU reference, passed here as one unbatched
# (32, 3) sample so the converted model's output can be compared with Keras.
gru_hls.predict(np.ones((32,3)))

libunwind: __unw_add_dynamic_fde: bad fde: FDE is really a CIE


array([-0.39746094,  0.20214844,  0.10644531, -0.13574219, -0.015625  ,
        0.01074219, -0.15625   , -0.14160156, -0.08105469, -0.07324219,
        0.05078125, -0.10546875,  0.140625  ,  0.14355469,  0.32226562,
       -0.09472656])

In [None]:
!cd model_gru_out/build/ && ls

## MLP

In [27]:
def get_mlp():
    """Build the example MLP and convert it with hls4ml.

    The Keras model takes 16 input features and stacks three Dense + ReLU
    pairs (64, 32 and 32 units, named fc1/relu1 .. fc3/relu3) followed by a
    5-unit Dense layer named 'output' and a softmax activation. Every Dense
    layer uses the 'lecun_uniform' kernel initializer. The model summary is
    printed as a side effect.

    Returns:
        tuple: ``(model, config, hls_model)`` -- the compiled Keras model, the
        hls4ml config created from it (granularity='name', default precision
        'ac_fixed<16, 6>'), and the hls4ml model converted for the oneAPI
        backend (part "Agilex7", output directory "model_mlp_out").
    """
    mlp = Sequential()
    for position, units in enumerate((64, 32, 32), start=1):
        # Only the first Dense layer declares the input shape.
        shape_kwargs = {'input_shape': (16,)} if position == 1 else {}
        mlp.add(Dense(units, name=f'fc{position}', kernel_initializer='lecun_uniform', **shape_kwargs))
        mlp.add(Activation(activation='relu', name=f'relu{position}'))
    mlp.add(Dense(5, name='output', kernel_initializer='lecun_uniform'))
    mlp.add(Activation(activation='softmax', name='softmax'))
    mlp.compile(loss='mse', optimizer=Adam())
    mlp.summary()

    hls_config = hls4ml.utils.config_from_keras_model(
        mlp,
        granularity='name',
        default_precision='ac_fixed<16, 6>',
    )
    converted = hls4ml.converters.convert_from_keras_model(
        model=mlp,
        output_dir="model_mlp_out",
        backend="oneAPI",
        part="Agilex7",
        hls_config=hls_config,
    )
    return mlp, hls_config, converted

mlp_cpu, config, mlp_hls = get_mlp()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 fc1 (Dense)                 (None, 64)                1088      
                                                                 
 relu1 (Activation)          (None, 64)                0         
                                                                 
 fc2 (Dense)                 (None, 32)                2080      
                                                                 
 relu2 (Activation)          (None, 32)                0         
                                                                 
 fc3 (Dense)                 (None, 32)                1056      
                                                                 
 relu3 (Activation)          (None, 32)                0         
                                                                 
 output (Dense)              (None, 5)                

In [28]:
# Compile the hls4ml model returned by get_mlp() (oneAPI backend).
mlp_hls.compile()

Writing HLS project
Done
/opt/intel/oneapi/compiler/2024.2/bin/icpx
-- The CXX compiler identification is IntelLLVM 2024.2.1
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /opt/intel/oneapi/compiler/2024.2/bin/icpx - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Configuring the design to run on FPGA board Agilex7
-- Additional USER_FPGA_FLAGS=-Wno-unused-label
-- Additional USER_FLAGS=-Wno-unused-label;-fconstexpr-steps=134217728
-- Additional USER_INCLUDE_PATHS=src;src/firmware
-- Additional USER_LIB_PATHS=
-- Additional USER_LIBS=
-- Configuring done (0.2s)
-- Generating done (0.0s)
-- Build files have been written to: /home/sdemirso/HLS4ML/model_mlp_out/build
[ 33%] [32mBuilding CXX object CMakeFiles/lib.dir/src/firmware/myproject.cpp.o[0m
[ 66%] [32mBuilding CXX object CMakeFiles/lib.dir/src/myproject_bridge.cpp.o[0m
[100%] [32m[1mLinking CXX shared library libmyproject-b1

libunwind: __unw_add_dynamic_fde: bad fde: FDE is really a CIE


[100%] Built target lib


In [29]:
# Run the compiled hls4ml MLP on one all-ones sample with 16 features.
mlp_hls.predict(np.ones(16))

libunwind: __unw_add_dynamic_fde: bad fde: FDE is really a CIE


array([0., 0., 1., 0., 0.])

## CNN - MNIST

In [72]:
def get_cnn():
    """Build the example 2-D CNN and convert it with hls4ml.

    The Keras model takes ``(28, 28, 1)`` inputs and stacks two
    Conv2D(3x3, ReLU, 'same' padding) + MaxPool2D(2x2) stages with 16 and 32
    filters, then flattens and ends in a 10-unit softmax Dense layer. The
    model summary is printed as a side effect.

    Returns:
        tuple: ``(model, config, hls_model)`` -- the compiled Keras model, the
        hls4ml config created from it (granularity='name', default precision
        'ac_fixed<16, 6>'), and the hls4ml model converted for the oneAPI
        backend (part "Agilex7", output directory "model_cnn_out").
    """
    cnn = Sequential([
        Conv2D(16, (3, 3), activation='relu', padding='same', input_shape=(28, 28, 1)),
        MaxPool2D(pool_size=(2, 2)),
        Conv2D(32, (3, 3), activation='relu', padding='same'),
        MaxPool2D(pool_size=(2, 2)),
        Flatten(),
        Dense(10, activation='softmax'),
    ])
    cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    cnn.summary()

    hls_config = hls4ml.utils.config_from_keras_model(
        cnn,
        granularity='name',
        default_precision='ac_fixed<16, 6>',
    )
    converted = hls4ml.converters.convert_from_keras_model(
        model=cnn,
        output_dir="model_cnn_out",
        backend="oneAPI",
        part="Agilex7",
        hls_config=hls_config,
    )
    return cnn, hls_config, converted

cnn_cpu, config, cnn_hls = get_cnn()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 28, 28, 16)        160       
                                                                 
 max_pooling2d_8 (MaxPooling  (None, 14, 14, 16)       0         
 2D)                                                             
                                                                 
 conv2d_13 (Conv2D)          (None, 14, 14, 32)        4640      
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 7, 7, 32)         0         
 2D)                                                             
                                                                 
 flatten_5 (Flatten)         (None, 1568)              0         
                                                                 
 dense_6 (Dense)             (None, 10)              

In [73]:
# Compile the hls4ml model returned by get_cnn() (oneAPI backend).
cnn_hls.compile()

Writing HLS project
Done
/opt/intel/oneapi/compiler/2024.2/bin/icpx
-- Configuring the design to run on FPGA board Agilex7
-- Additional USER_FPGA_FLAGS=-Wno-unused-label
-- Additional USER_FLAGS=-Wno-unused-label;-fconstexpr-steps=134217728
-- Additional USER_INCLUDE_PATHS=src;src/firmware
-- Additional USER_LIB_PATHS=
-- Additional USER_LIBS=
-- Configuring done (0.0s)
-- Generating done (0.0s)
-- Build files have been written to: /home/sdemirso/HLS4ML/model_cnn_out/build
[ 33%] [32mBuilding CXX object CMakeFiles/lib.dir/src/firmware/myproject.cpp.o[0m
[ 66%] [32mBuilding CXX object CMakeFiles/lib.dir/src/myproject_bridge.cpp.o[0m
[100%] [32m[1mLinking CXX shared library libmyproject-d858c69a.so[0m


libunwind: __unw_add_dynamic_fde: bad fde: FDE is really a CIE


[100%] Built target lib


In [74]:
# Keras (CPU) prediction for a batch of one all-ones 28x28x1 input.
cnn_cpu.predict(np.ones((1,28,28,1)))



array([[0.09495136, 0.09383111, 0.11293188, 0.12613861, 0.0914408 ,
        0.07592314, 0.09947468, 0.10293911, 0.14101414, 0.06135505]],
      dtype=float32)

## Library and Layer Support

### ML framework support:

(Q)Keras

PyTorch

(Q)ONNX (in development)

### Neural network architectures:

Fully connected NN (multilayer perceptron, MLP)

Convolutional NN

Recurrent NN (LSTM, GRU)

Graph NN (GarNet)

### Layers:

- Core Layers

InputLayer, Dropout, Flatten, Dense, TernaryDense, BinaryDense, Transpose, Resize

- Convolution

Conv1D, Conv2D

- Pooling

MaxPooling1D, MaxPooling2D, AveragePooling1D, AveragePooling2D

- Normalization

BatchNormalization

- Activation

LeakyReLU, ThresholdedReLU, Sigmoid, ELU, PReLU, TanH, Binary TanH, Softmax, Softsign, SELU Activation

## Quantized Conv1D using QKeras
**Note:** QKeras `QGRU` has a known bug (https://github.com/google/qkeras/issues/72). A simple fix exists, but it has not been merged into the main branch (https://github.com/google/qkeras/pull/89). In addition, `QGRU` does not work with newer versions of TensorFlow since this commit: https://github.com/tensorflow/tensorflow/commit/f564400be34ed5c82f448699a2d91e1135d76f97

In [None]:
def readucr(filename):
    """Load a tab-separated file into a feature matrix and integer labels.

    The first column of every row is taken as the label and all remaining
    columns as the values of that row's sample.

    Args:
        filename: Anything ``np.loadtxt`` accepts as its first argument (this
            notebook passes URL strings).

    Returns:
        tuple: ``(x, y)`` where ``x`` is the 2-D array of every column except
        the first and ``y`` is the first column cast to ``int``.
    """
    # ndmin=2 keeps a single-row file two-dimensional; without it loadtxt
    # returns a 1-D array and the column slicing below raises IndexError.
    data = np.loadtxt(filename, delimiter="\t", ndmin=2)
    y = data[:, 0]
    x = data[:, 1:]
    return x, y.astype(int)


# Base URL of the FordA train/test TSV files.
root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"

# Load the training split first, then the test split.
(x_train, y_train), (x_test, y_test) = (
    readucr(root_url + split_file)
    for split_file in ("FordA_TRAIN.tsv", "FordA_TEST.tsv")
)


In [None]:
# Append a trailing axis of length 1: (rows, columns) -> (rows, columns, 1).
x_train = x_train.reshape(x_train.shape[:2] + (1,))
x_test = x_test.reshape(x_test.shape[:2] + (1,))


In [None]:
# Number of distinct label values present in the training labels.
num_classes = np.unique(y_train).size

In [None]:
# Shuffle samples and labels together with one shared permutation. The
# generator is seeded so that a fresh "Restart & Run All" reproduces the same
# sample order; the previous unseeded np.random.permutation produced a
# different order on every run.
shuffle_rng = np.random.default_rng(42)
idx = shuffle_rng.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]


In [None]:
# Recode label -1 as 0, in place, in both label arrays.
np.putmask(y_train, y_train == -1, 0)
np.putmask(y_test, y_test == -1, 0)

# Scale both splits by the largest value found in the training inputs.
max_value = x_train.max()
x_train, x_test = x_train / max_value, x_test / max_value

In [None]:
def get_qconv():
    """Build and compile the quantized Conv1D classifier with QKeras layers.

    Layer stack, in order: QConv1D(16) on inputs of shape ``(500, 1)``,
    QConv1D(32), MaxPool1D, QConv1D(64), MaxPool1D, Flatten, QDense(32) and a
    final QDense(1) with a quantized sigmoid. Every convolution uses kernel
    size 3, 'valid' padding and a ``quantized_relu(8)`` activation. The model
    summary is printed as a side effect.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """

    def qbits(width):
        # Fresh weight/bias quantizer on every call; only the bit width varies.
        return quantized_bits(width, 1, 1, alpha=1)

    def qconv(filters, width, **extra):
        # Kernel and bias of a conv layer are quantized with the same width.
        return QConv1D(
            filters,
            kernel_size=3,
            padding='valid',
            activation=quantized_relu(8),
            kernel_quantizer=qbits(width),
            bias_quantizer=qbits(width),
            **extra,
        )

    stack = [
        qconv(16, 12, input_shape=(500, 1)),
        qconv(32, 8),
        MaxPool1D(),
        qconv(64, 8),
        MaxPool1D(),
        Flatten(),
        QDense(
            32,
            activation=quantized_relu(8),
            kernel_quantizer=qbits(12),
            bias_quantizer=qbits(8),
        ),
        QDense(
            1,
            activation=quantized_sigmoid(8, use_real_sigmoid=True),
            kernel_quantizer=qbits(8),
            bias_quantizer=qbits(8),
        ),
    ]

    qmodel = Sequential(stack)
    qmodel.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    qmodel.summary()
    return qmodel


conv_model = get_qconv()

In [None]:
# Train for 20 epochs with batch size 32 and validation_split=0.2; the value
# returned by fit() is kept in `history`.
history = conv_model.fit(
    x=x_train,
    y=y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    verbose=1,
)

In [None]:
# Create the hls4ml config (granularity='name') for the trained QKeras model,
# then convert the model for the oneAPI backend, part "Agilex7".
config = hls4ml.utils.config_from_keras_model(
    conv_model,
    granularity='name',
    default_precision='ac_fixed<18, 3>',
)
hls_model = hls4ml.converters.convert_from_keras_model(
    model=conv_model,
    hls_config=config,
    backend="oneAPI",
    part="Agilex7",
    output_dir="model_conv1d_out",
)


In [None]:
# Compile the hls4ml model converted from the trained QKeras network.
hls_model.compile()

### Try predicting with the models

In [None]:
# hls4ml prediction for the last training sample (passed without a batch axis).
hls_model.predict(x_train[-1])

In [None]:
# Keras prediction for the same last training sample, reshaped into a batch
# of one with shape (1, 500, 1).
conv_model.predict(x_train[-1].reshape((1,500,1)))

### Accuracy of the models

In [None]:
# Score the test set with the trained Keras model first, then with the
# compiled hls4ml model.
keras_preds, hls_preds = (
    predictor.predict(x_test) for predictor in (conv_model, hls_model)
)


In [None]:
# Test accuracy of each model: rounded predictions compared with the labels.
print(
    f"Trained model: {np.mean(y_test == np.round(keras_preds.ravel())) * 100:.1f}% "
    f"vs. HLS emulation: {np.mean(y_test == np.round(hls_preds.ravel())) * 100:.2f}%"
)

In [None]:
# Share of test samples on which both models agree after rounding.
print(
    "Predictions of trained and HLS converted models are the same: "
    f"{np.mean(np.round(hls_preds.ravel()) == np.round(keras_preds.ravel())) * 100:.2f}%"
)