In [1]:
from extern_funcs import interpolate, fusion, celu, ln
from pynq import Overlay, allocate
import numpy as np
import math
import time
import nngen_ctrl as ng

In [2]:

# Parameter blob that is later copied into the device buffer at param_offset;
# the archive stores it under the key 'arr_0'.
params = np.load("../params_nngen/params.npz")['arr_0']

# The remaining archives are kept as open handles and indexed by array name
# in the cells below.
inputs_npz, outputs_npz, intrinsics_npz = (
    np.load(archive_path)
    for archive_path in ("../params_nngen/inputs.npz",
                         "../params_nngen/outputs.npz",
                         "../params_nngen/intrinsics.npz")
)

In [3]:
# Keys read from inputs.npz. The order matters: the inputs are placed in the
# device buffer in exactly this order.
input_files = [
    'reference_image',
    'frame_number',
    'n_measurement_frames',
    'measurement_feature0',
    'measurement_feature1',
    'hidden_state',
    'cell_state',
]

# Keys read from outputs.npz (reference results). The order matters: the
# output regions are placed in the device buffer in exactly this order.
output_files = [
    'layer1',
    'layer2',
    'layer3',
    'layer4',
    'layer5',
    'feature_one_sixteen',
    'feature_one_eight',
    'feature_quarter',
    'feature_half',
    'cost_volume',
    'skip0',
    'skip1',
    'skip2',
    'skip3',
    'bottom',
    'cell_state',
    'hidden_state',
    'depth_org',
]

In [4]:
# Every region placed in the shared buffer ends on a multiple of this many bytes,
# so the next region always starts chunk-aligned.
chunk_size = 64


def get_end_addr(addr, memory_size):
    """Return the chunk-aligned address at which a region ends.

    Args:
        addr: start offset of the region.
        memory_size: size of the region, in the same unit as ``addr``.

    Returns:
        ``addr + memory_size`` rounded up to the next multiple of
        ``chunk_size``, as a Python ``int``.
    """
    end = addr + memory_size
    # Ceiling division done with floor division on the negated value. This
    # stays exact for arbitrarily large integers, whereas going through a float
    # quotient can round down once the value no longer fits in 53 bits.
    return int(-(-end // chunk_size) * chunk_size)

def shape2size(shape):
    """Return the number of elements described by ``shape``.

    The extents are multiplied together; an empty shape yields 1.
    """
    element_count = 1
    for extent in shape:
        element_count *= extent
    return element_count

In [5]:
# The last axis of every tensor is padded to a multiple of
# axi_datawidth // act_bit elements (128 // 16 = 8) before it is placed in memory.
axi_datawidth = 128
act_bit = 16
num_align_words = axi_datawidth // act_bit

# addrs collects the start address of every region in placement order:
# first all outputs, then all inputs; the final entry is where the params start.
output_offset = 0
addrs = [output_offset]
output_aligned_shapes = []
outputs = []
for output_name in output_files:
    expected = outputs_npz[output_name]
    # Round the last axis up to a multiple of num_align_words using exact
    # integer arithmetic (ceiling division via negated floor division).
    aligned_last = -(-expected.shape[-1] // num_align_words) * num_align_words
    aligned_shape = (*expected.shape[:-1], aligned_last)
    output_aligned_shapes.append(aligned_shape)
    outputs.append(expected)
    addrs.append(get_end_addr(addrs[-1], shape2size(aligned_shape) * (act_bit // 8)))

input_offset = addrs[-1]
inputs = []
for input_name in input_files:
    tensor = inputs_npz[input_name]
    # Number of zero elements needed on the last axis to reach the alignment.
    lack = (-tensor.shape[-1]) % num_align_words
    if lack > 0:
        zero_tail = np.zeros((*tensor.shape[:-1], lack), dtype=tensor.dtype)
        tensor = np.concatenate([tensor, zero_tail], axis=-1)
    # Inputs are stored flattened as int16, ready to be byte-copied into the buffer.
    flat_words = tensor.reshape(-1).astype(np.int16)
    inputs.append(flat_words)
    addrs.append(get_end_addr(addrs[-1], flat_words.size * (act_bit // 8)))
param_offset = addrs[-1]
print(output_offset, input_offset, param_offset)
print(addrs)

0 711936 1019264
[0, 49152, 67584, 75264, 79872, 83712, 85248, 91392, 115968, 214272, 410880, 509184, 558336, 582912, 595200, 601344, 607488, 613632, 711936, 810240, 810304, 810368, 908672, 1006976, 1013120, 1019264]


In [6]:
# Bitstream file and the name under which the generated IP appears in it.
bitfile, ipname = 'design_1.bit', 'dvmvs_0'

# Load the overlay; inspect overlay.ip_dict to list the IP names it exposes.
overlay = Overlay(bitfile)

# Control handle for the IP, provided by the nngen_ctrl helper module.
ip = ng.nngen_core(overlay, ipname)

In [7]:
# One flat, byte-addressed buffer of 192 MiB; every address used in this
# notebook is a byte offset into it.
memory_size = 192 * 1024 * 1024
buf = allocate(shape=(memory_size,), dtype=np.uint8)

In [8]:
# addrs holds one entry per output, then one per input, then param_offset;
# the slice below therefore selects exactly the input start addresses.
input_base_addrs = addrs[len(output_files):-1]
for input_words, input_base in zip(inputs, input_base_addrs):
    # Reinterpret the int16 words as raw bytes so they can be stored in the uint8 buffer.
    input_bytes = input_words.view(np.uint8)
    buf[input_base:input_base + input_bytes.size] = input_bytes

# Copy the parameters as raw bytes. Viewing them as uint8 keeps the bit pattern
# without an implicit signed-to-unsigned cast and sizes the slice by byte count,
# so it is correct for any parameter dtype.
param_bytes = np.ascontiguousarray(params).reshape(-1).view(np.uint8)
buf[param_offset:param_offset + param_bytes.size] = param_bytes

In [9]:
# Register the shared buffer with the IP control handle.
# NOTE(review): presumably this makes the byte offsets used in this notebook
# (output_offset / input_offset / param_offset and the extern addresses)
# relative to the start of `buf` — confirm in nngen_ctrl.
ip.set_global_buffer(buf)

In [10]:
# Arrays from intrinsics.npz that are handed to the fusion operator (opcode 0x80).
half_K = intrinsics_npz["half_K"]
pose1s = intrinsics_npz["current_pose"]
pose2ss = intrinsics_npz["measurement_poses"]

# Dispatch table for operators that are executed on the host by run_extern when
# the IP reports the corresponding opcode.
# opcode -> (func, [input.addr], input.aligned_shape, output.addr, output.aligned_shape)
#   * addresses are byte offsets into `buf`;
#   * shapes are aligned shapes in int16 elements, and every input address of
#     one entry is read with the same input.aligned_shape;
#   * per the address list printed by the layout cell, 115968 is the
#     'feature_half' output, 810368 / 908672 are the 'measurement_feature0' /
#     'measurement_feature1' inputs, 214272 is 'cost_volume', 601344 is the
#     'cell_state' output and 607488 is the 'hidden_state' output;
#   * addresses from 175806656 upwards lie beyond param_offset — presumably
#     intermediate buffers assigned by the generated design; TODO confirm.
# NOTE(review): the integer arguments to interpolate / fusion / celu / ln are
# defined in extern_funcs; their meaning is not visible here.
externs = {0x63: (interpolate(4, 6, 0, 'nearest'), [175806656], (1, 2, 3, 32), 175808576, (1, 4, 6, 32)),
           0x67: (interpolate(8, 12, 0, 'nearest'), [175810112], (1, 4, 6, 32), 175817792, (1, 8, 12, 32)),
           0x71: (interpolate(16, 24, 0, 'nearest'), [175823936], (1, 8, 12, 32), 175854656, (1, 16, 24, 32)),
           0x75: (interpolate(32, 48, 0, 'nearest'), [175879232], (1, 16, 24, 32), 176002112, (1, 32, 48, 32)),
           0x80: (fusion(11, half_K, pose1s, pose2ss), [115968, 810368, 908672], (1, 32, 48, 32), 214272, (1, 32, 48, 64)),
           0x105: (lambda x : celu(12)(ln(12)(x)), [177103424], (1, 2, 3, 512), 177121856, (1, 2, 3, 512)),
           0x106: (ln(12), [177128000], (1, 2, 3, 512), 601344, (1, 2, 3, 512)),
           0x107: (celu(12), [601344], (1, 2, 3, 512), 177134144, (1, 2, 3, 512)),
           0x108: (interpolate(4, 6, 0, 'bilinear'), [607488], (1, 2, 3, 512), 177146432, (1, 4, 6, 512)),
           0x114: (interpolate(8, 12, 0, 'bilinear'), [177232448], (1, 4, 6, 256), 177244736, (1, 8, 12, 256)),
           0x116: (interpolate(8, 12, 0, 'bilinear'), [177293888], (1, 4, 6, 8), 177294272, (1, 8, 12, 8)),
           0x121: (interpolate(16, 24, 0, 'bilinear'), [177397184], (1, 8, 12, 128), 177421760, (1, 16, 24, 128)),
           0x123: (interpolate(16, 24, 0, 'bilinear'), [177520064], (1, 8, 12, 8), 177521600, (1, 16, 24, 8)),
           0x128: (interpolate(32, 48, 0, 'bilinear'), [177736640], (1, 16, 24, 64), 177785792, (1, 32, 48, 64)),
           0x130: (interpolate(32, 48, 0, 'bilinear'), [177982400], (1, 16, 24, 8), 177988544, (1, 32, 48, 8)),
           0x135: (interpolate(64, 96, 0, 'bilinear'), [178553792], (1, 32, 48, 8), 178971584, (1, 64, 96, 8)),
           0x136: (interpolate(64, 96, 0, 'bilinear'), [178455488], (1, 32, 48, 32), 178578368, (1, 64, 96, 32))}

In [11]:
# Scalars-from-file that run_extern forwards to multi-input operators.
frame_number, n_measurement_frames = (
    inputs_npz[key] for key in ("frame_number", "n_measurement_frames")
)

def run_extern(code):
    """Execute one host-side operator on data held in the shared buffer.

    The entry ``externs[code]`` supplies the callable, the byte addresses of
    its inputs, the aligned input shape, the byte address of the output and
    the aligned output shape. Inputs are read from ``buf`` as int16, the
    callable is applied, and the result is written back to ``buf`` as int16.
    The opcode and the elapsed wall-clock time are printed.

    Args:
        code: opcode used as key into ``externs``.

    Raises:
        KeyError: if ``code`` has no entry in ``externs``.
    """
    start_time = time.time()

    func, input_addrs, input_aligned_shape, output_addr, output_aligned_shape = externs[code]
    bytes_per_word = act_bit // 8
    input_nbytes = shape2size(input_aligned_shape) * bytes_per_word

    operands = []
    for input_addr in input_addrs:
        operand = buf[input_addr:input_addr + input_nbytes].view(np.int16).reshape(input_aligned_shape)
        # An aligned last axis of exactly 8 is treated as a single real channel
        # followed by alignment padding, so only channel 0 is passed on.
        # NOTE(review): this would also strip a genuine 8-channel tensor — confirm
        # no such operand exists in the design.
        if operand.shape[-1] == 8:
            operand = operand[:, :, :, :1]
        operands.append(operand)

    if len(operands) == 1:
        output = func(*operands)
    else:
        # Multi-input operators additionally receive the frame counters.
        output = func(frame_number, operands[0], n_measurement_frames, *operands[1:])

    # Zero-pad the last axis up to the aligned width. The amount is derived from
    # the shapes, so results with any channel count (not only 1-of-8) fit their slot.
    pad = output_aligned_shape[-1] - output.shape[-1]
    if pad > 0:
        zero_tail = np.zeros((*output.shape[:-1], pad), dtype=output.dtype)
        output = np.concatenate([output, zero_tail], axis=-1)
    # print(code, output.shape)
    output = output.astype(np.int16).reshape(-1)
    output_nbytes = shape2size(output_aligned_shape) * bytes_per_word
    buf[output_addr:output_addr + output_nbytes] = output.view(np.uint8)

    print(code, time.time() - start_time)


In [12]:
# Number of extern requests serviced during one run; the externs table has
# 17 entries.
num_extern_calls = 17

# Wall-clock time of one complete run, host-side operators included.
start_time = time.time()

ip.run()
for _ in range(num_extern_calls):
    # Wait for the next extern opcode, execute it on the host, then let the IP continue.
    code = ip.wait_extern()
    run_extern(code)
    ip.resume_extern()
ip.wait()

print(time.time() - start_time)


99 0.04647946357727051
103 0.0020220279693603516
113 0.0016057491302490234
117 0.0068972110748291016
128 1.1646838188171387
261 2.26096248626709
262 0.00365447998046875
263 2.23189640045166
264 0.008161306381225586
276 0.0015597343444824219
278 0.0009458065032958984
289 0.002558469772338867
291 0.001417398452758789
296 0.0049512386322021484
304 0.0014598369598388672
310 0.012249231338500977
309 0.002284526824951172
7.7131805419921875


In [13]:
# Compare every output region of the buffer with its reference array and
# print the correlation coefficient between the two.
for name, reference, base_addr, aligned_shape in zip(output_files, outputs, addrs, output_aligned_shapes):
    region_nbytes = shape2size(aligned_shape) * (act_bit // 8)
    computed = buf[base_addr:base_addr + region_nbytes].view(np.int16).reshape(aligned_shape)
    # Drop the alignment padding on the last axis (the slice assumes 4-D tensors).
    computed = computed[:, :, :, :reference.shape[-1]]
    correlation = np.corrcoef(reference.reshape(-1), computed.reshape(-1))[0, 1]
    print(name, correlation)

layer1 0.9999999999999998
layer2 1.0
layer3 0.9999999893227762
layer4 0.9999999535704848
layer5 0.9999998857999312
feature_one_sixteen 0.9999999446782779
feature_one_eight 0.9999999669183328
feature_quarter 0.9999999740406471
feature_half 0.9999999666703193
cost_volume 0.9999999890399363
skip0 0.9999999727280664
skip1 0.9999999719662179
skip2 0.9999999694383696
skip3 0.9999999674678867
bottom 0.9999999744975764
cell_state 0.9999938806636451
hidden_state 0.9999959832960018
depth_org 0.9999087198577311


In [14]:
# Release the memory behind the buffer obtained from allocate().
# NOTE(review): `buf` should not be accessed after this call — re-run the
# allocation cell before running inference again.
buf.freebuffer()