### Import relevant libraries and files

In [None]:
from pynq import allocate
from pynq import Overlay
import numpy as np
import pynq.lib.dma
import time
import pynq
import pandas as pd

In [None]:
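# Import weights and biases from the trained model here if they are not hard-coded.
# NOTE: cpu_evaluate reads weight_0..weight_3 and bias_0..bias_3, so they must be defined before it is called.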
# weight_0 = np.load('weight_0.npy')


In [None]:
# Read the evaluation CSV twice: once as a numeric matrix holding the first
# 11 columns (header row skipped), and once through pandas to obtain the
# 'Activity' label column.
test = np.loadtxt(
    "dataset/test.csv",
    delimiter=",",
    skiprows=1,
    usecols=np.arange(11),
)
data_label = pd.read_csv("dataset/test.csv")["Activity"]

In [None]:
# Pick a reproducible random subset of rows to benchmark on.
test_size = 300
np.random.seed(2)  # fixed seed so every run selects the same rows
# NOTE(review): indices are drawn from 0..1999 only -- presumably test.csv has
# at least 2000 data rows; confirm against the dataset.
a = np.random.permutation(2000)[:test_size]
testset = test[a]
testset.shape

### FPGA

In [None]:
# Load the bitstream into the FPGA and take the DMA block from the overlay.
overlay = Overlay('bitstream.bit')
dma = overlay.axi_dma_0
# float32 buffers obtained from pynq.allocate, created in the same order as
# before. Only input_buffer0 and output_buffer0 are used by the evaluation
# functions visible in this notebook.
(
    input_buffer0,
    input_buffer1,
    input_buffer2,
    input_buffer3,
    input_buffer4,
    input_buffer5,
) = (
    allocate(shape=(length,), dtype=np.float32)
    for length in (12, 48, 36, 24, 12, 6)
)
output_buffer0 = allocate(shape=(6,), dtype=np.float32)

In [None]:
import asyncio

# Accumulator for the asynchronous path. It starts as one row of six zeros;
# every completed transfer stacks another row underneath it.
async_result = np.zeros(6)


async def calculation():
    """Run one DMA round trip and append the output to ``async_result``.

    Starts a send transfer of ``input_buffer0`` and a receive transfer into
    ``output_buffer0``, awaits the send channel and then the receive channel,
    and finally stacks the received values as a new row onto the global
    ``async_result``.
    """
    global async_result
    send_channel, recv_channel = dma.sendchannel, dma.recvchannel
    send_channel.transfer(input_buffer0)
    recv_channel.transfer(output_buffer0)
    # Wait for the outgoing transfer first, then for the incoming one.
    await asyncio.ensure_future(send_channel.wait_async())
    await asyncio.ensure_future(recv_channel.wait_async())
    async_result = np.vstack((async_result, output_buffer0))

In [None]:
# fpga calculation
def fpga_evaluate_async(testcount, test):
    """Evaluate samples on the FPGA using the asyncio-based DMA wait.

    Args:
        testcount: Number of rows of ``test`` to evaluate, starting at row 0.
        test: 2-D array-like of samples; the first 11 values of each row are
            copied into ``input_buffer0`` before the transfer.

    Returns:
        Array of shape ``(testcount, 6)`` with one row of outputs per sample.

    Side effects:
        Resets and then grows the global ``async_result`` (via
        ``calculation``) and overwrites ``input_buffer0``.
    """
    global async_result
    loop = asyncio.get_event_loop()
    # Seed with one dummy 2-D row so calculation() can stack onto it and so an
    # empty run still returns a (0, 6) array; the dummy row is dropped below.
    async_result = np.zeros((1, 6))
    # Honour the caller's testcount (previously the global test_size was used).
    for i in range(testcount):
        # Only the first 11 slots are written; any further slot of
        # input_buffer0 keeps whatever value it already holds.
        input_buffer0[:11] = test[i][:11]
        loop.run_until_complete(calculation())
    return async_result[1:]

In [None]:
def fpga_evaluate(testcount, test):
    """Evaluate samples on the FPGA with blocking DMA transfers.

    Args:
        testcount: Number of rows of ``test`` to evaluate, starting at row 0.
        test: 2-D array-like of samples; the first 11 values of each row are
            copied into ``input_buffer0`` before the transfer.

    Returns:
        float64 array of shape ``(testcount, len(output_buffer0))`` holding
        the contents of ``output_buffer0`` after each transfer.

    Side effects:
        Overwrites ``input_buffer0`` and ``output_buffer0``.
    """
    # Preallocate the result instead of re-stacking the whole array per sample.
    result = np.zeros((testcount, len(output_buffer0)))
    # Honour the caller's testcount (previously the global test_size was used).
    for i in range(testcount):
        # Only the first 11 slots are written; any further slot of
        # input_buffer0 keeps whatever value it already holds.
        input_buffer0[:11] = test[i][:11]
        dma.sendchannel.transfer(input_buffer0)
        dma.recvchannel.transfer(output_buffer0)
        dma.sendchannel.wait()
        dma.recvchannel.wait()
        # Copy out now: output_buffer0 is reused by the next iteration.
        result[i] = output_buffer0
    return result

### Ultra96

In [None]:
# evaluation using cpu
# Module-level scratch arrays are still defined so that any other cell that
# refers to them keeps working; cpu_evaluate now uses per-call arrays instead.
buffer_0 = np.zeros(64)
buffer_1 = np.zeros(64)
buffer_2 = np.zeros(64)
buffer_3 = np.zeros(6)
buffer_4 = np.zeros(6)


def _dense_layer(inputs, weight, bias, relu):
    """Compute one fully connected layer, one output neuron at a time."""
    width = weight.shape[1]
    out = np.empty(width)
    for j in range(width):
        # A 1-D column is selected so np.dot returns a scalar rather than a
        # one-element array.
        value = np.dot(inputs, weight[:, j]) + bias[j]
        if relu and value < 0:
            value = 0
        out[j] = value
    return out


def cpu_evaluate(testcount, test, weights=None, biases=None):
    """Run the network forward pass on the CPU.

    Every layer except the last is followed by a ReLU; the last layer's raw
    outputs are returned.

    Args:
        testcount: Number of rows of ``test`` to evaluate, starting at row 0.
        test: 2-D array-like of input samples, one sample per row.
        weights: Optional sequence of 2-D weight matrices, one per layer, each
            shaped ``(inputs, outputs)``. Defaults to the globals
            ``weight_0`` .. ``weight_3``.
        biases: Optional sequence of per-layer bias vectors. Defaults to the
            globals ``bias_0`` .. ``bias_3``.

    Returns:
        float64 array of shape ``(testcount, outputs_of_last_layer)``.

    Raises:
        NameError: If ``weights`` or ``biases`` is omitted and the
            corresponding globals have not been defined.
    """
    if weights is None:
        weights = (weight_0, weight_1, weight_2, weight_3)
    if biases is None:
        biases = (bias_0, bias_1, bias_2, bias_3)

    last_layer = len(weights) - 1
    # Preallocate instead of re-stacking the whole result for every sample.
    result = np.zeros((testcount, weights[-1].shape[1]))
    # NOTE(review): the per-neuron loop is kept so timings stay comparable with
    # earlier runs; a vectorised ``x @ W + b`` would be much faster -- confirm
    # which CPU baseline is intended.
    for i in range(testcount):
        activation = test[i]
        for layer, (weight, bias) in enumerate(zip(weights, biases)):
            activation = _dense_layer(
                activation, weight, bias, relu=layer < last_layer
            )
        result[i] = activation
    return result

### Power Metric

In [None]:
# Two independent recorders over the same pair of power sensors
# (PSINT_FP and PSPLL): recorder1 for the FPGA runs, recorder2 for the CPU run.
rails = pynq.get_rails()
recorder1, recorder2 = (
    pynq.DataRecorder(rails['PSINT_FP'].power, rails['PSPLL'].power)
    for _ in range(2)
)

### FPGA Performance

In [None]:
# record time taken for fpga to predict and present as graph
recorder1.reset()
# NOTE(review): this cell samples every 0.00001 s while the other benchmark
# cells use 0.01 s -- confirm the finer interval is intentional.
with recorder1.record(0.00001):
    time.sleep(1)  # idle period before the run
    recorder1.mark()
    # perf_counter is the clock meant for timing short intervals; unlike
    # time.time it is not affected by system clock adjustments.
    timestart = time.perf_counter()
    fpga_res = fpga_evaluate(test_size, testset)
    timeend = time.perf_counter()
    recorder1.mark()
    time.sleep(1)  # idle period after the run
fpga_time = timeend - timestart
print('Time taken = ' + str(fpga_time))
recorder1.frame.plot(subplots=True)
# Collapse each row of six outputs to the index of its largest value.
fpga_res = np.argmax(fpga_res, axis=-1)

In [None]:
# Record power while timing the asyncio-based FPGA evaluation, then plot it.
recorder1.reset()
with recorder1.record(0.01):
    time.sleep(1)  # idle period before the run
    recorder1.mark()
    # perf_counter is the clock meant for timing short intervals; unlike
    # time.time it is not affected by system clock adjustments.
    timestart = time.perf_counter()
    fpga_res_async = fpga_evaluate_async(test_size, testset)
    timeend = time.perf_counter()
    recorder1.mark()
    time.sleep(1)  # idle period after the run
fpga_async_time = timeend - timestart
print('Time taken = ' + str(fpga_async_time))
recorder1.frame.plot(subplots=True)
# Collapse each row of six outputs to the index of its largest value.
fpga_res_async = np.argmax(fpga_res_async, axis=-1)

### CPU Performance

In [None]:
# record time taken for cpu to predict and present as graph
recorder2.reset()
with recorder2.record(0.01):
    time.sleep(1)  # idle period before the run
    recorder2.mark()
    # perf_counter is the clock meant for timing short intervals; unlike
    # time.time it is not affected by system clock adjustments.
    timestart = time.perf_counter()
    cpu_res = cpu_evaluate(test_size, testset)
    timeend = time.perf_counter()
    recorder2.mark()
    time.sleep(1)  # idle period after the run
cpu_time = timeend - timestart
print('Time taken = ' + str(cpu_time))
recorder2.frame.plot(subplots=True)
# Collapse each row of outputs to the index of its largest value.
cpu_res = np.argmax(cpu_res, axis=-1)

### Speed and Accuracy

### Synchronous 

In [None]:
# Percentage of samples where the CPU and synchronous FPGA predictions agree,
# followed by the ratio of CPU time to FPGA time.
print('Results matched = ' + str(np.mean(cpu_res == fpga_res) * 100) + '%')
print('Speedup factor = ' + str(cpu_time / fpga_time))

In [None]:
# Translate each predicted class index into its activity name.
# NOTE(review): assumes the index order matches the label encoding used when
# the model was trained -- confirm.
labels = np.array([
    'LAYING',
    'SITTING',
    'STANDING',
    'WALKING',
    'WALKING_DOWNSTAIRS',
    'WALKING_UPSTAIRS',
])
decoded_predictions = labels.take(fpga_res)

In [None]:
from IPython.display import display

# Table of FPGA predictions next to the true labels of the selected rows,
# renumbered from 0 so the original row indices are not shown.
dataset = pd.DataFrame({
    'FPGA predictions': decoded_predictions,
    'Actual test label': data_label[a],
}).reset_index(drop=True)
# Let the notebook render up to 300 rows. The former 'display.height' setting
# was removed: current pandas no longer defines that option and set_option
# raises on unknown keys.
pd.set_option('display.max_rows', 300)
display(dataset)

In [None]:
# Percentage of decoded FPGA predictions that equal the true activity label.
print('Results matched = ' + str((decoded_predictions == data_label[a]).mean() * 100) + '%')

### Asynchronous

In [None]:
# Percentage of samples where the CPU and asynchronous FPGA predictions agree,
# followed by the ratio of CPU time to asynchronous FPGA time.
print('Results matched = ' + str(np.mean(cpu_res == fpga_res_async) * 100) + '%')
print('Speedup factor = ' + str(cpu_time / fpga_async_time))

In [None]:
from pynq import ps

# Set ps.Clocks.fclk0_mhz .. fclk3_mhz to 50, in the same order as before.
for _fclk in ('fclk0_mhz', 'fclk1_mhz', 'fclk2_mhz', 'fclk3_mhz'):
    setattr(ps.Clocks, _fclk, 50)

In [None]:
# Repeat the synchronous FPGA benchmark (power recording, timing, plot).
recorder1.reset()
with recorder1.record(0.01):
    time.sleep(1)  # idle period before the run
    recorder1.mark()
    # perf_counter is the clock meant for timing short intervals; unlike
    # time.time it is not affected by system clock adjustments.
    timestart = time.perf_counter()
    fpga_res = fpga_evaluate(test_size, testset)
    timeend = time.perf_counter()
    recorder1.mark()
    time.sleep(1)  # idle period after the run
fpga_time = timeend - timestart
print('Time taken = ' + str(fpga_time))
recorder1.frame.plot(subplots=True)
# Collapse each row of six outputs to the index of its largest value.
fpga_res = np.argmax(fpga_res, axis=-1)

In [None]:
# Read back the current fclk0 setting; as the cell's last expression it is
# shown as the cell output.
ps.Clocks.fclk0_mhz

In [None]:
# Import pynq's pl module and display its HWH attribute as the cell output.
# NOTE(review): whether pl exposes HWH depends on the installed PYNQ version -- confirm.
from pynq import pl
pl.HWH

In [None]:
# Set ps.Clocks.fclk0_mhz .. fclk3_mhz to 50, then re-read the power rails
# and display them as the cell output.
for _fclk in ('fclk0_mhz', 'fclk1_mhz', 'fclk2_mhz', 'fclk3_mhz'):
    setattr(ps.Clocks, _fclk, 50)
rails = pynq.get_rails()
rails

In [None]:
# Set ps.Clocks.fclk0_mhz .. fclk3_mhz to 100, then re-read the power rails
# and display them as the cell output.
for _fclk in ('fclk0_mhz', 'fclk1_mhz', 'fclk2_mhz', 'fclk3_mhz'):
    setattr(ps.Clocks, _fclk, 100)
rails = pynq.get_rails()
rails