# Quantized Deployment and Throughput/Accuracy Testing
This notebook deploys the quantized model on an FPGA and performs throughput and accuracy testing.
If the desired deployment is on a PYNQ board, this notebook should be run from the PYNQ host.

In [1]:
from output_final.driver.finn.core.datatype import DataType
from driver_base import FINNExampleOverlay
import numpy as np
import proc_for_infer as pfi
from utils_pynq import load_split_np_data

In [43]:
#set up model info
# dictionary describing the I/O of the FINN-generated accelerator
io_shape_dict = {
    # FINN DataType for input and output tensors
    "idt" : DataType.UINT6,
    "odt" : DataType.INT6,
    # shapes for input and output tensors (NHWC layout)
    "ishape_normal" : (1, 14),
    "oshape_normal" : (1, 1),
    # folded / packed shapes below depend on idt/odt and input/output
    # PE/SIMD parallelization settings -- these are calculated by the
    # FINN compiler.
    "ishape_folded" : (1, 14, 1),
    "oshape_folded" : (1, 1, 1),
    "ishape_packed" : (1, 14, 1),
    "oshape_packed" : (1, 1, 1)
}
platform = "zynq-iodma"
bitfile = "output_final/deploy/bitfile/finn-accel.bit"
runtime_weights = "output_final/deploy/driver/runtime_weights/"
batch_size = 50

In [44]:
#create accelerator
accel = FINNExampleOverlay(bitfile_name=bitfile, platform=platform, 
                          io_shape_dict=io_shape_dict, batch_size=batch_size, fclk_mhz=50,
                          runtime_weight_dir=runtime_weights)

### Throughput Test
Run a basic throughput test and print the results

In [45]:
res = accel.throughput_test()
print(res)

{'runtime[ms]': 0.6701946258544922, 'throughput[images/s]': 74605.19388118107, 'DRAM_in_bandwidth[Mb/s]': 1.0444727143365351, 'DRAM_out_bandwidth[Mb/s]': 0.074605193881181073, 'fclk[mhz]': 49.9995, 'batch_size': 50, 'fold_input[ms]': 5.221366882324219e-05, 'pack_input[ms]': 0.6663839817047119, 'copy_input_data_to_device[ms]': 0.0001659393310546875, 'copy_output_data_from_device[ms]': 7.62939453125e-05, 'unpack_output[ms]': 0.011673688888549805, 'unfold_output[ms]': 3.457069396972656e-05}


### Accuracy Test
Perform inference on testing dataset and compare to actual values.

In [46]:
#accel.batch_size(229538)
print(accel.ishape_normal)

(50, 14)


In [36]:
datasets = load_split_np_data()
print(datasets)

X_train shape: (229538, 14)
X_val shape: (12752, 14)
X_test shape: (12752, 14)
Y_train shape: (229538, 1)
Y_val shape: (12752, 1)
Y_test shape: (12752, 1)
Using saved split data
[array([[  5.80000000e+01,   2.00000000e+00,   3.69370341e+00, ...,
          1.64524792e+03,   1.56832642e+03,   9.73000366e+02],
       [  7.30000000e+01,   3.00000000e+00,   1.63631487e+00, ...,
          1.09457141e+03,   8.94265137e+02,   7.37251587e+02],
       [  7.40000000e+01,   2.00000000e+00,   1.15914593e+01, ...,
          4.53916113e+03,   1.46776880e+03,   7.66278015e+02],
       ..., 
       [  6.40000000e+01,   2.00000000e+00,   9.21862221e+00, ...,
          1.73535828e+03,   1.19223145e+03,   6.71937500e+02],
       [  7.10000000e+01,   2.00000000e+00,   7.65338302e-01, ...,
          8.91962585e+02,   1.62368567e+03,   8.88209412e+02],
       [  9.10000000e+01,   2.00000000e+00,   8.17640972e+00, ...,
          2.95410669e+03,   1.74744006e+03,   8.36169189e+02]]), array([[  5.70000000e+01, 

In [19]:
inps = datasets[1]
exp_out = datasets[4]
valid_size = len(exp_out)
num_batches = int(valid_size/batch_size)
running_error_square = 0
exp_act_out = [[],[]]
for i in range(0, num_batches):
    batch = inps[(i*batch_size):((i+1)*batch_size)]
    batch_exp_out = exp_out[(i*batch_size):((i+1)*batch_size)]
    #print(batch.shape)
    proc_batch = pfi.preproc(batch)
    #proc_batch = np.flip(proc_batch, axis=1)
    batch_out = accel.execute(proc_batch)
    batch_out = batch_out * .9649           #last mul node not absorbed into dataflow partition
    batch_proc_out = pfi.postproc(batch_out)
    batch_errs = batch_proc_out-batch_exp_out
    #print("exp: " + str(batch_exp_out) + " act: " + str(batch_proc_out) + " preproc: " + str(batch_out))
    batch_sq_errs = batch_errs*batch_errs
    running_error_square += sum(batch_sq_errs)
    exp_act_out[0].append(batch_exp_out[0][0])
    exp_act_out[1].append(batch_proc_out[0][0])
    #batch = valid[(i*batch_size):((i+1)*batch_size)]
    #inp = batch[0]
    #inp = FACILE_preproc(inp.float())
    #inp = inp.numpy()
    #inp_dict = {in_tensor : inp}
    #exp_out = batch[1].numpy()
    #out_dict = exe_onnx(model, inp_dict)
    #out = out_dict[out_tensor]
    #out = tensor(out,dtype=float32)
    #out = FACILE_postproc(out)
    #print(out)
print("MSE: " + str(running_error_square / (num_batches * batch_size)))

MSE: [ 108.26291761]


In [47]:
#trying to max throughput
import time
proc_inps = pfi.preproc(inps)
unproc_infs = None
start_time = time.time()
for i in range(0, num_batches):
    batch = proc_inps[(i*batch_size):((i+1)*batch_size)]
    batch_out = accel.execute(batch)
    if unproc_infs is None:
        unproc_infs = batch_out
    else:
        np.concatenate([unproc_infs,batch_out])
end_time = time.time()
throughput = (num_batches * batch_size)/(end_time-start_time)
print(throughput)

73.72876009743612


In [13]:
import pickle
save_loc = 'exp_act_comparisons/fpga_deploy.pkl'
with open(save_loc, 'wb') as file:
    pickle.dump(exp_act_out, file)