In [1]:
%matplotlib inline
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Small worked example: a (1, 3) input row times a (3, 4) weight matrix
# gives a (1, 4) output row.
weights = np.array(
    [[1, 2, 3, 4],
     [5, 6, 7, 8],
     [9, 10, 11, 12]],
    dtype=np.float32,
)
inputs = np.array([[0.1, 0.2, 0.3]], dtype=np.float32)
outputs = np.dot(inputs, weights)

# Shapes first ...
for caption, shown in (
    ("Inputs (Shape):\n", inputs.shape),
    ("Output (Shape):\n", outputs.shape),
    ("Weights (Shape):\n", weights.shape),
):
    print(caption, shown)

# ... then the values themselves.
for caption, shown in (
    ("Inputs:\n", inputs),
    ("Weights:\n", weights),
    ("Output:\n", outputs),
):
    print(caption, shown)

# Side-by-side "input . weights = output" picture: the first weight row shares
# a line with the input and output, the remaining rows are printed beneath it.
print()
print('Input\t\t\t Weights\t\t\t  Output')
print(inputs[0], '   . \t', weights[0], '\t\t= ', outputs[0])
for weight_row in weights[1:3]:
    print('\t\t\t', weight_row)


Inputs (Shape):
 (1, 3)
Output (Shape):
 (1, 4)
Weights (Shape):
 (3, 4)
Inputs:
 [[0.1 0.2 0.3]]
Weights:
 [[ 1.  2.  3.  4.]
 [ 5.  6.  7.  8.]
 [ 9. 10. 11. 12.]]
Output:
 [[3.8000002 4.4       5.        5.6000004]]

Input			 Weights			  Output
[0.1 0.2 0.3]    . 	 [1. 2. 3. 4.] 		=  [3.8000002 4.4       5.        5.6000004]
			 [5. 6. 7. 8.]
			 [ 9. 10. 11. 12.]


In [3]:
# how it's done in dot.sv
def pydot(inputs, weights):
    """Multiply one input vector by a weight matrix with explicit Python loops.

    Parameters
    ----------
    inputs : nested sequence or array
        The input vector wrapped in one outer level; only ``inputs[0]`` is used.
    weights : array with a 2-element ``shape``
        ``weights[i][j]`` is the weight from input ``i`` to output ``j``.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``weights.shape[1]``.
    """
    vector = inputs[0]  # strip the outer nesting
    n_inputs, n_outputs = weights.shape[0], weights.shape[1]
    outs = np.zeros(n_outputs, dtype=np.float32)
    # Outer loop walks the inputs, inner loop walks the outputs; every output
    # accumulates one product per input, in input order.
    for row in range(n_inputs):
        for col in range(n_outputs):
            outs[col] += weights[row][col] * vector[row]
    return outs

# hand-written loop implementation
print(pydot(inputs, weights))
# NumPy reference; [0] unwraps the single output row
print(inputs.dot(weights)[0])

[3.8000002 4.4       5.        5.6000004]
[3.8000002 4.4       5.        5.6000004]


In [4]:
from pynq import Overlay
from pynq import MMIO
from pynq import allocate

class HwDot():
    """Run a dot product on the FPGA design reached through ``axi_dma_0``.

    One float32 input buffer and one float32 output buffer are allocated once
    at construction and reused for every call to :meth:`dot`.
    """

    def __init__(self, bitstream, input_len=20, output_len=10):
        """Create the overlay and allocate the two DMA buffers.

        Parameters
        ----------
        bitstream : str
            Bitstream file handed to ``pynq.Overlay``.
        input_len : int, optional
            Number of float32 values sent per call. Defaults to 20, the
            length this class previously hard-coded.
        output_len : int, optional
            Number of float32 values received per call. Defaults to 10, the
            length this class previously hard-coded.
        """
        self.overlay = Overlay(bitstream)
        self.dma = self.overlay.axi_dma_0

        self.input_buffer = allocate(shape=(input_len,), dtype=np.float32)
        self.output_buffer = allocate(shape=(output_len,), dtype=np.float32)

    def dot(self, inputs):
        """Send ``inputs`` to the design and return the output buffer.

        Parameters
        ----------
        inputs : array-like
            Values copied into the input buffer with ``np.copyto``, so they
            must be broadcastable to that buffer's shape.

        Returns
        -------
        The instance's own ``output_buffer`` (not a copy). The next call to
        ``dot`` overwrites it, so copy the result if it must outlive that call.
        """
        np.copyto(self.input_buffer, inputs)

        # Start both directions before waiting on either one.
        self.dma.sendchannel.transfer(self.input_buffer)
        self.dma.recvchannel.transfer(self.output_buffer)

        self.dma.sendchannel.wait()
        self.dma.recvchannel.wait()

        return self.output_buffer

        

In [5]:
# Load the test data from JSON. Note that this rebinds the notebook-level
# names `weights` and `inputs` used by the earlier toy example.
# `weights` becomes an ndarray; `inputs` stays whatever json.load returns
# (presumably a flat list of 20 numbers, matching HwDot's input buffer -- confirm).
with open('weights.json') as f:
    weights= np.array(json.load(f))
with open('inputs.json') as f:
    inputs= json.load(f)

# Software reference: wrap `inputs` in a list so np.dot sees a single row and
# returns a 2-D result with one row.
sw_outputs = np.dot( [inputs], weights)
print (sw_outputs)

# Hardware run: build HwDot from 'unpipelined.bit' and push the same inputs
# through its DMA-backed dot().
unpipe_dot = HwDot('unpipelined.bit')

# dot() returns the HwDot instance's own output buffer, not a copy.
unpipe_outputs = unpipe_dot.dot(inputs)
print (unpipe_outputs)

def approx_equal(v0, v1, error=1E-6):
    """Compare two vectors element by element within an absolute tolerance.

    Parameters
    ----------
    v0, v1 : iterable of numbers
        The vectors to compare.
    error : float, optional
        Two elements match when ``abs(x - y) < error`` (strict inequality).

    Returns
    -------
    list of bool
        One entry per position of the LONGER vector. Positions present in
        only one vector are reported as ``False``, so ``all(...)`` of the
        result cannot report equality for vectors of different lengths
        (a plain ``zip`` would silently ignore the extra elements).
    """
    left, right = list(v0), list(v1)
    results = [bool(abs(x - y) < error) for x, y in zip(left, right)]
    # Every unmatched trailing element counts as a mismatch.
    results.extend([False] * abs(len(left) - len(right)))
    return results

# Per-element verdicts: the software reference row (sw_outputs holds a single
# row, hence [0]) against the hardware result.
equal = approx_equal(sw_outputs[0], unpipe_outputs)
print(equal)

# Overall verdict: true only when every element matched.
print('Equal: ', all(equal))

[[-0.39490299 -0.45582341 -0.33740613 -0.46385369 -0.32445234  0.29448395
   1.21528153 -0.15059742 -0.17037034 -0.13412056]]
[-0.3949029  -0.4558233  -0.33740613 -0.4638537  -0.32445237  0.294484
  1.2152815  -0.15059741 -0.17037027 -0.13412057]
[True, True, True, True, True, True, True, True, True, True]
Equal:  True


In [6]:
import timeit as tt

def py_test():
    """Timing target: the pure-Python ``pydot`` on the notebook's inputs and weights."""
    wrapped = [inputs]  # pydot receives the vector nested in one outer list
    return pydot(wrapped, weights)
    
def np_test():
    """Timing target: ``np.dot`` on the notebook's inputs (as one row) and weights."""
    wrapped = [inputs]  # single-row matrix, so the result is 2-D with one row
    return np.dot(wrapped, weights)

def unpipe_test():
    """Timing target: one ``dot`` call on the ``unpipe_dot`` hardware wrapper."""
    return unpipe_dot.dot(inputs)
    
# Run each implementation once and cross-check the results before timing.
py_out = py_test()
np_out = np_test()
unpipe_out = unpipe_test()
# np_out is 2-D with a single row, hence the [0].
for caption, lhs, rhs in (
    ("Py vs. Np: Equal: ", py_out, np_out[0]),
    ("Np vs. Unpipe: Equal: ", np_out[0], unpipe_out),
):
    print(caption, all(approx_equal(lhs, rhs)))
print()

# Time 1000 calls of each implementation and report the total wall time.
for banner, target in (
    ("Timing Python", py_test),
    ("Timing Numpy", np_test),
    ("Timing Unpipelined Hardware", unpipe_test),
):
    print(banner)
    time = tt.timeit(target, number=1000)
    print("Total Time:", time, " seconds", sep="")
    print()

Py vs. Np: Equal:  True
Np vs. Unpipe: Equal:  True

Timing Python
Total Time:15.018473377916962 seconds

Timing Numpy
Total Time:0.0773667530156672 seconds

Timing Unpipelined Hardware
Total Time:0.7320800370071083 seconds



## Update your Bitstream with a Pipelined Dot, then run this block

In [7]:
import timeit as tt

# Second HwDot instance, built from 'bitstream.bit' -- the bitstream the
# heading above asks you to regenerate with the pipelined design first.
pipe_dot = HwDot('bitstream.bit')

def pipe_test():
    """Timing target: one ``dot`` call on the ``pipe_dot`` hardware wrapper."""
    return pipe_dot.dot(inputs)

# One pipelined run, checked against the result kept from the unpipelined design.
pipe_out = pipe_test()
print("Pipe vs. UnPipe: Equal: ", all(approx_equal(pipe_out, unpipe_out)))

# Same measurement as for the other implementations: total time for 1000 calls.
print("Timing Pipelined Hardware")
time = tt.timeit(pipe_test, number=1000)
print("Total Time:", time, " seconds", sep="")
print()

Pipe vs. UnPipe: Equal:  True
Timing Pipelined Hardware
Total Time:0.7323577431961894 seconds

