In [27]:
import serial
import numpy as np
import time

In [28]:
# Saturation limits of the two 8-bit formats used below.
UINT8_MAX = 255
INT8_MAX = 127
UNT8_MAX = INT8_MAX  # legacy misspelled name, kept so existing references keep working

SEED = 0  # fixed seed so the random test matrices (and the printed scale/shift) are reproducible

row_cnt = 15 # N_ROW_CACHED in verilog

M = 256
N = 256 # max is limited by block ram 1024 currently -> Only 30 usable 30*4kbit/8 = 15.36 kbyte = 15 * 1024byte
K = 225 # should be multiple of row_cnt

assert K % row_cnt == 0, "K must be a multiple of row_cnt"

# Floating-point reference problem: m3 = m1 @ m2 with shapes (M, N) @ (N, K).
rng = np.random.default_rng(SEED)
m1 = rng.standard_normal((M, N))
m2 = rng.standard_normal((N, K))

m3 = np.matmul(m1, m2)

# Symmetric scale factors: the largest magnitude of each matrix maps to +/-INT8_MAX.
m1_sf = INT8_MAX / np.abs(m1).max()
m2_sf = INT8_MAX / np.abs(m2).max()
m3_sf = INT8_MAX / np.abs(m3).max()

# int8 quantized operands and the int8 quantized reference result.
m1_q = (np.round(m1_sf * m1)).astype(np.int8)
m2_q = (np.round(m2_sf * m2)).astype(np.int8)
m3_q = (np.round(m3_sf * m3)).astype(np.int8)

# Factor that rescales an (m1_q @ m2_q) accumulator into the m3_q range.
norm_factor = m3_sf / (m1_sf * m2_sf)
# Largest exponent p for which norm_factor * 2**p still fits in a uint8.
p = np.floor(np.log2(UINT8_MAX) - np.log2(norm_factor))

# Fixed-point approximation: norm_factor ~= scale / 2**shift, both stored as uint8.
scale = (np.floor(norm_factor * 2**p)).astype(np.uint8)
shift = p.astype(np.uint8)
scale_shift = np.array([scale, shift], dtype=np.uint8)
print("scale, shift: ", scale, shift, " norm: ", norm_factor)

# Dimensions are encoded as (dimension - 1) split into [high byte, low byte].
length_N = np.array([((N - 1) // 256), (N - 1) & 0x00FF], dtype=np.uint8)
length_M = np.array([((M - 1) // 256), (M - 1) & 0x00FF], dtype=np.uint8)
print('N: ', length_N, N)
print('M: ', length_M, M)

# Destination buffer for the result read back from the device.
m3_device = np.zeros((M, K), dtype=np.int8)

scale, shift:  252 17  norm:  0.001928974155927864
N:  [  0 255] 256
M:  [  0 255] 256


In [29]:
# Stream the quantized operands to the device over the serial port and read the
# int8 result back, row_cnt output columns at a time. The context manager
# guarantees the port is closed even if a write or read raises part-way through.
with serial.Serial("/dev/ttyACM3", write_timeout=3) as s:
    start = time.time()
    for i in range(0, K, row_cnt):
        # Header (N-1, M-1, scale, shift) followed by row_cnt columns of m2_q,
        # transposed so that each column is sent as one contiguous run.
        data = np.concatenate((length_N, length_M, scale_shift,
                               np.transpose(m2_q[:, i:(i+row_cnt)]).flatten()))

        # astype(np.uint8) maps negative int8 values to their two's-complement byte.
        data_b = bytes(data.astype(np.uint8))
        s.write(data_b)
        for j in range(M):
            # Send one row of m1_q, then read back row_cnt result bytes for that row.
            data_b = bytes(m1_q[j,:].astype(np.uint8))
            s.write(data_b)
            data_back = s.read(row_cnt)
            # Reinterpret the returned bytes as signed 8-bit values.
            o_column = np.frombuffer(data_back, dtype=np.int8)
            m3_device[j, i:(i+row_cnt)] = o_column

    end = time.time()

print("MxN NxK:", end - start, ' seconds')
print((2*N*M*K + M*K)/ (end-start) / 1e6, "MOPS (int8)" )

# Cast to float BEFORE subtracting: an int8 - int8 subtraction wraps modulo 256,
# which would misreport large errors.
diff = (m3_device.astype(np.single) - m3_q.astype(np.single)).flatten()
# Report the worst-case error magnitude so negative deviations are not hidden.
print('max_diff: ', np.abs(diff).max())
m_orig = m3_q.flatten().astype(np.single)

print("SNR: ", 10 * np.log10(np.sum(m_orig * m_orig) / np.sum(diff * diff)))

MxN NxK: 2.08961820602417  seconds
14.140765004254668 MOPS (int8)
max_diff:  1.0
SNR:  30.67455291748047


In [None]:
#512 512 256 6.7