In [1]:
from extern_funcs import interpolate, fusion, celu, ln
from pynq import Overlay, allocate
import numpy as np
import math
import time
import nngen_ctrl as ng

In [28]:

# Load the .npz files stored under params_nngen/.
# `params` and `output` are single arrays taken out of their archives right away;
# `inputs_npz` and `intrinsics_npz` stay as archive objects and are indexed by
# key in later cells.
params_npz = np.load("params_nngen/params.npz")
params = params_npz['arr_0']  # copied into the shared buffer at param_offset later

inputs_npz = np.load("params_nngen/inputs.npz")

output_npz = np.load("params_nngen/output.npz")
output = output_npz['output']  # reference result, compared with the buffer contents later

intrinsics_npz = np.load("params_nngen/intrinsics.npz")

In [29]:
# Keys of inputs.npz. The order matters: the layout cell walks this list and
# assigns each tensor the next free address in the shared buffer.
input_files = [
    'reference_image',
    'frame_number',
    'n_measurement_frames',
    'measurement_feature0',
    'measurement_feature1',
    'hidden_state',
    'cell_state',
]

In [30]:
# Every region placed in the buffer ends on a multiple of this many bytes.
chunk_size = 64


def get_end_addr(addr, memory_size):
    """Return the end address of a region, rounded up to a multiple of ``chunk_size``.

    Parameters
    ----------
    addr : int
        Start address (byte offset) of the region.
    memory_size : int
        Size of the region in bytes.

    Returns
    -------
    int
        The smallest multiple of ``chunk_size`` that is >= ``addr + memory_size``.
    """
    end = addr + memory_size
    # Exact ceiling division: -(-a // b) == ceil(a / b) without going through a
    # float, which would lose precision (and could round *down*) for large values.
    return int(-(-end // chunk_size)) * chunk_size

def shape2size(shape):
    """Return the number of elements in an array of the given ``shape``.

    This is the product of all entries of ``shape``; an empty shape gives 1.
    """
    n_elements = 1
    for dim in shape:
        n_elements = n_elements * dim
    return n_elements

In [31]:
# Both widths are in bits (act_bit // 8 is used below as the byte size of one value).
axi_datawidth = 128
act_bit = 16
# Number of values per bus word; the last dimension of every tensor is padded
# up to a multiple of this.
num_align_words = axi_datawidth // act_bit

# Buffer layout, in this order: output tensor, each input tensor, parameters.
# addrs[i] is the start of region i and addrs[i + 1] its chunk-aligned end.
output_offset = 0
addrs = [output_offset]
# Exact integer ceiling of the last dimension to a multiple of num_align_words.
aligned_last_dim = -(-output.shape[-1] // num_align_words) * num_align_words
output_aligned_shape = (*output.shape[:-1], aligned_last_dim)
addrs.append(get_end_addr(addrs[-1], shape2size(output_aligned_shape) * (act_bit // 8)))

input_offset = addrs[-1]
inputs = []
# Loop names deliberately avoid `input`, which would shadow the builtin for the
# rest of the kernel session.
for input_name in input_files:
    tensor = inputs_npz[input_name]
    # Zero-pad the last axis up to the next multiple of num_align_words.
    lack = (num_align_words - (tensor.shape[-1] % num_align_words)) % num_align_words
    if lack > 0:
        tensor = np.pad(tensor, [(0, 0)] * (tensor.ndim - 1) + [(0, lack)])
    # Flattened int16 copy; this is what gets written into the buffer later.
    tensor = np.reshape(tensor, [-1]).astype(np.int16)
    inputs.append(tensor)
    addrs.append(get_end_addr(addrs[-1], tensor.size * (act_bit // 8)))
param_offset = addrs[-1]
print(output_offset, input_offset, param_offset)
print(addrs)

0 98304 405632
[0, 98304, 196608, 196672, 196736, 295040, 393344, 399488, 405632]


In [5]:
# Bitstream to load and the name of the accelerator IP inside it.
bitfile = 'design_1.bit'
ipname = 'dvmvs_0'

# Constructing the Overlay loads `bitfile` (PYNQ behaviour; NOTE(review): confirm
# for the PYNQ version in use).
overlay = Overlay(bitfile)
# Debug aid: uncomment to list the IP blocks available in the overlay.
# overlay.ip_dict
# Handle used by the later cells to run the core and service its extern calls.
ip = ng.nngen_core(overlay, ipname)

In [6]:
# One flat byte buffer of 192 * 2**20 bytes. Every address used in this
# notebook (addrs, param_offset, the externs table) is a byte offset into it.
memory_size = 192 * 1024 * 1024
buf = allocate(shape=(memory_size,), dtype=np.uint8)

In [32]:
# Copy every flattened input tensor to its slot in the shared buffer.
# addrs[1:-1] are the input start offsets: addrs[0] is the output region and
# addrs[-1] is where the parameters begin.
# (`input_data` rather than `input`, so the builtin is not shadowed.)
for input_data, addr in zip(inputs, addrs[1:-1]):
    buf[addr:addr + input_data.nbytes] = input_data.view(np.uint8)

# Copy the parameters byte-for-byte. A uint8 view matches the buffer dtype and
# its length is the true byte count, whatever dtype `params` was saved with.
param_bytes = params.reshape(-1).view(np.uint8)
buf[param_offset:param_offset + param_bytes.size] = param_bytes

In [8]:
# Hand the shared buffer to the core wrapper.
# NOTE(review): presumably this passes the buffer's device address to the IP —
# confirm in nngen_ctrl.
ip.set_global_buffer(buf)

In [21]:
# Arrays from intrinsics.npz that are passed to the `fusion` operator below.
half_K = intrinsics_npz["half_K"]
pose1s = intrinsics_npz["current_pose"]
pose2ss = intrinsics_npz["measurement_poses"]

# Operators that run_extern() executes in Python on data read from `buf`.
# opcode -> (func, [input.addr], input.aligned_shape, output.addr, output.aligned_shape)
#   * Addresses are byte offsets into `buf`; they are hard-coded here and must
#     match the memory map of the loaded design.
#   * The single input shape is applied to every address in the input list.
#   * Keys are hex literals, while the run log prints opcodes in decimal
#     (e.g. 0x63 is printed as 99).
#   * For the interpolate entries, the first two arguments equal dimensions 1
#     and 2 of the output shape.
#   * 0x80 (fusion) is the only entry with several inputs; its extra addresses
#     196736 and 295040 are the buffer slots of 'measurement_feature0' and
#     'measurement_feature1' in the layout printed earlier.
#   * 0x107 reads from the address that 0x106 writes to (177121856).
externs = {0x63: (interpolate(4, 6, 0, 'nearest'), [175276736], (1, 2, 3, 32), 175278656, (1, 4, 6, 32)),
           0x67: (interpolate(8, 12, 0, 'nearest'), [175280192], (1, 4, 6, 32), 175287872, (1, 8, 12, 32)),
           0x71: (interpolate(16, 24, 0, 'nearest'), [175294016], (1, 8, 12, 32), 175324736, (1, 16, 24, 32)),
           0x75: (interpolate(32, 48, 0, 'nearest'), [175349312], (1, 16, 24, 32), 175472192, (1, 32, 48, 32)),
           0x80: (fusion(11, half_K, pose1s, pose2ss), [175668800, 196736, 295040], (1, 32, 48, 32), 175865408, (1, 32, 48, 64)),
           0x105: (lambda x : celu(12)(ln(12)(x)), [177091136], (1, 2, 3, 512), 177109568, (1, 2, 3, 512)),
           0x106: (ln(12), [177115712], (1, 2, 3, 512), 177121856, (1, 2, 3, 512)),
           0x107: (celu(12), [177121856], (1, 2, 3, 512), 177128000, (1, 2, 3, 512)),
           0x108: (interpolate(4, 6, 0, 'bilinear'), [177140288], (1, 2, 3, 512), 177146432, (1, 4, 6, 512)),
           0x114: (interpolate(8, 12, 0, 'bilinear'), [177232448], (1, 4, 6, 256), 177244736, (1, 8, 12, 256)),
           0x116: (interpolate(8, 12, 0, 'bilinear'), [177293888], (1, 4, 6, 8), 177294272, (1, 8, 12, 8)),
           0x121: (interpolate(16, 24, 0, 'bilinear'), [177397184], (1, 8, 12, 128), 177421760, (1, 16, 24, 128)),
           0x123: (interpolate(16, 24, 0, 'bilinear'), [177520064], (1, 8, 12, 8), 177521600, (1, 16, 24, 8)),
           0x128: (interpolate(32, 48, 0, 'bilinear'), [177736640], (1, 16, 24, 64), 177785792, (1, 32, 48, 64)),
           0x130: (interpolate(32, 48, 0, 'bilinear'), [177982400], (1, 16, 24, 8), 177988544, (1, 32, 48, 8)),
           0x135: (interpolate(64, 96, 0, 'bilinear'), [178553792], (1, 32, 48, 8), 178971584, (1, 64, 96, 8)),
           0x136: (interpolate(64, 96, 0, 'bilinear'), [178455488], (1, 32, 48, 32), 178578368, (1, 64, 96, 32))}

In [33]:
# Raw (unpadded) arrays from inputs.npz; run_extern() passes them to the
# multi-input operator alongside the tensors read from the buffer.
frame_number, n_measurement_frames = (
    inputs_npz[key] for key in ("frame_number", "n_measurement_frames")
)

def run_extern(code):
    """Run the Python operator registered for extern opcode ``code``.

    Looks up ``externs[code]``, reads each input tensor from ``buf`` as int16 in
    the recorded aligned shape, calls the operator, and writes the result back
    to ``buf`` at the recorded output address as int16. Finally prints ``code``
    and the elapsed time in seconds.

    Parameters
    ----------
    code : int
        Opcode used as the key into ``externs``.

    Raises
    ------
    KeyError
        If ``code`` has no entry in ``externs``.
    ValueError
        If the operator result cannot be brought to the recorded output size by
        zero-padding its last axis.
    """
    start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals

    func, input_addrs, input_aligned_shape, output_addr, output_aligned_shape = externs[code]
    bytes_per_value = act_bit // 8
    input_nbytes = shape2size(input_aligned_shape) * bytes_per_value

    tensors = []
    for input_addr in input_addrs:
        tensor = buf[input_addr:input_addr + input_nbytes].view(np.int16).reshape(input_aligned_shape)
        # A last axis exactly one alignment unit wide is treated as a single
        # real channel followed by alignment padding; keep only channel 0.
        if tensor.shape[-1] == num_align_words:
            tensor = tensor[..., :1]
        tensors.append(tensor)

    if len(tensors) == 1:
        result = func(tensors[0])
    else:
        # Multi-input operators also receive the frame bookkeeping arrays.
        result = func(frame_number, tensors[0], n_measurement_frames, *tensors[1:])

    if result.shape != output_aligned_shape:
        # Zero-pad the last axis up to the aligned width recorded in the table
        # (derived from the shapes rather than assuming a fixed pad of 7).
        missing = output_aligned_shape[-1] - result.shape[-1]
        if missing < 0:
            raise ValueError(
                "extern {}: result shape {} exceeds aligned output shape {}".format(
                    code, result.shape, output_aligned_shape))
        result = np.pad(result, [(0, 0)] * (result.ndim - 1) + [(0, missing)])

    output_elems = shape2size(output_aligned_shape)
    if result.size != output_elems:
        raise ValueError(
            "extern {}: result has {} elements, expected {} for shape {}".format(
                code, result.size, output_elems, output_aligned_shape))

    # print(code, result.shape)
    result = result.astype(np.int16).reshape(-1)
    buf[output_addr:output_addr + output_elems * bytes_per_value] = result.view(np.uint8)

    print(code, time.perf_counter() - start_time)


In [34]:
# Run one inference, servicing extern requests from the core, and print the
# total wall time in seconds.
start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals

ip.run()
# One wait/run/resume round-trip per entry in `externs` (17 in the recorded run).
# This assumes the core requests every registered opcode exactly once per run;
# the recorded log shows each of the 17 opcodes once.
# The loop variable is not named `_` because IPython uses `_` for the last output.
num_extern_calls = len(externs)
for extern_call in range(num_extern_calls):
    code = ip.wait_extern()  # opcode of the pending extern, used as key into `externs`
    run_extern(code)
    ip.resume_extern()
ip.wait()

print(time.perf_counter() - start_time)

99 0.11020803451538086
103 0.0008809566497802734
113 0.0009844303131103516
117 0.0021445751190185547
128 1.0873734951019287
261 2.1880037784576416
262 0.003666400909423828
263 2.202094316482544
264 0.0017180442810058594
276 0.001550436019897461
278 0.0009791851043701172
289 0.0023622512817382812
291 0.0022974014282226562
296 0.004970073699951172
304 0.0014636516571044922
310 0.011937856674194336
309 0.002193450927734375
7.730220794677734


In [35]:

# Read the output region back from the buffer as int16 in its aligned shape,
# drop the alignment padding on the last axis, and print the correlation
# matrix against the reference `output`.
# NOTE: the recorded run printed an off-diagonal value of about -0.56.
output_nbytes = shape2size(output_aligned_shape) * (act_bit // 8)
output_start = addrs[0]
hw_output = buf[output_start:output_start + output_nbytes].view(np.int16).reshape(output_aligned_shape)
orig = hw_output[..., :output.shape[-1]]
print(np.corrcoef(output.reshape(-1), orig.reshape(-1)))

[[ 1.        -0.5619412]
 [-0.5619412  1.       ]]


In [12]:
# Shape of the reference output next to the aligned shape of the output region
# (the first `input_offset` bytes of the buffer, viewed as int16).
hw_region = buf[:input_offset].view(np.int16).reshape(output_aligned_shape)
print(output.shape, hw_region.shape)

(1, 64, 96, 1) (1, 64, 96, 8)


In [14]:
# Spot check: all values of the reference output at index 10 of axis 1
# (batch 0, channel 0).
print(output[0, 10, :, 0])

[3225 2839 2913 2913 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839
 2767 2767 2767 2767 2767 2767 2695 2695 2695 2695 2695 2695 2695 2767
 2767 2767 2767 2767 2767 2767 2767 2767 2767 2839 2839 2839 2839 2839
 2839 2913 2913 2913 2913 2913 2913 2913 2913 2839 2839 2839 2839 2839
 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839
 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839 2839
 2839 2839 2839 2913 2913 2989 2989 2989 3066 3145 3225 4015]


In [12]:
# Count zeros in channel 0 of the buffer's output region, restricted to
# index 16 onward along axis 1.
hw_tail = buf[:input_offset].view(np.int16).reshape(output_aligned_shape)[:, 16:, :, :1]
print(np.sum(hw_tail.reshape(-1) == 0))

4608


In [13]:
# Count entries equal to 8192 in channel 0 of the buffer's output region,
# restricted to the first 16 indices along axis 1.
hw_head = buf[:input_offset].view(np.int16).reshape(output_aligned_shape)[:, :16, :, :1]
print(np.sum(hw_head.reshape(-1) == 8192))

1536


In [25]:
# Correlation matrix between the reference output and channel 0 of the
# buffer's output region. It stays the last expression so the notebook
# displays it.
hw_channel0 = buf[:input_offset].view(np.int16).reshape(output_aligned_shape)[:, :, :, :1]
np.corrcoef(output.reshape(-1), hw_channel0.reshape(-1))

array([[ 1.       , -0.5619412],
       [-0.5619412,  1.       ]])

In [16]:
# Release the memory behind the buffer obtained from `allocate`.
# NOTE(review): `buf` (and any view of it) should not be used after this —
# confirm against the PYNQ buffer documentation.
buf.freebuffer()