In [1]:
from extern_funcs_cython import interpolate, Fusion, ln, lstm_state_calculator
from keyframe_buffer import KeyframeBuffer
from pynq import Overlay, allocate
import numpy as np
import math
import time
import nngen_ctrl as ng

In [2]:
# Load the exported network parameters plus the reference tensors used by the later cells.
# The *_npz objects are kept as NpzFile handles and indexed by key further down.
# Flattened parameter blob, copied into the shared buffer at param_offset.
params = np.load("../params_nngen/params.npz")['arr_0']
# Example input tensors; their shapes drive the input address layout.
inputs_npz = np.load("../params_nngen/inputs.npz")
# Example output tensors; their shapes drive the output address layout.
outputs_npz = np.load("../params_nngen/outputs.npz")
# Reference results from the software model, used for the correlation check and debug prints.
predictions_npz = np.load("../params_nngen/predictions.npz")
# Per-frame images, poses and initial LSTM state fed through the pipeline.
intrinsics_npz = np.load("../params_nngen/intrinsics.npz")

In [3]:
# Names of the tensors the accelerator produces, in the order their regions are laid out.
output_files = ['feature_half', 'cell_state', 'hidden_state', 'depth_org']
# Names of the tensors the host feeds in; their regions follow the outputs.
input_files = ['reference_image', 'hidden_state', 'cell_state']

# Position of the cell_state *output* within the combined (outputs, then inputs) ordering.
cell_state_idx = output_files.index('cell_state')
# Position of the hidden_state *input* within the combined ordering (inputs come after all outputs).
hidden_state_idx = input_files.index('hidden_state') + len(output_files)

In [4]:
chunk_size = 64  # allocation granularity in bytes; every region ends on a multiple of this


def get_end_addr(addr, memory_size):
    """Return the end address of a region, rounded up to a multiple of ``chunk_size``.

    Args:
        addr: Start address of the region in bytes.
        memory_size: Size of the region in bytes.

    Returns:
        The smallest multiple of ``chunk_size`` that is >= ``addr + memory_size``,
        as a Python ``int``.
    """
    end = addr + memory_size
    # Integer ceiling division. The float form ceil(end / chunk_size) silently loses
    # precision once end exceeds 2**53 and can then return an address below ``addr``.
    return int(-(-end // chunk_size)) * chunk_size

def shape2size(shape):
    """Return the number of elements described by ``shape``.

    Args:
        shape: Iterable of dimension sizes.

    Returns:
        The product of all dimensions; 1 for an empty shape.
    """
    n_elements = 1
    for dim in shape:
        n_elements = n_elements * dim
    return n_elements

In [5]:
# Compute the byte address of every output and input region inside the shared buffer.
# addrs[i] is the start of region i (outputs first, then inputs); addrs[-1] is the end of the
# last input region and is used as the start of the parameter blob.

# Bus width and activation width in bits. The last axis of every tensor is padded to a
# multiple of num_align_words (128 // 16 = 8) elements.
axi_datawidth = 128
act_bit = 16
num_align_words = axi_datawidth // act_bit

# --- output regions, laid out back to back from output_offset ---
output_offset = 0
outputs = []
output_aligned_shapes = []
addrs = [output_offset]
for file in output_files:
    output = outputs_npz[file]
    # Same shape with the last axis rounded up to a multiple of num_align_words.
    output_aligned_shape = (*output.shape[:-1], int(math.ceil(output.shape[-1] / num_align_words)) * num_align_words)
    outputs.append(output)
    output_aligned_shapes.append(output_aligned_shape)
    # Region size is element count * (act_bit // 8) bytes; the end is rounded up by get_end_addr.
    addrs.append(get_end_addr(addrs[-1], shape2size(output_aligned_shape) * (act_bit // 8)))

# --- input regions, continuing directly after the outputs ---
input_offset = addrs[-1]
inputs = []
for file in input_files:
    # NOTE(review): `input` shadows the built-in input() at notebook scope from here on.
    input = inputs_npz[file]
    # Number of zero elements needed to pad the last axis to a multiple of num_align_words.
    lack = (num_align_words - (input.shape[-1] % num_align_words)) % num_align_words
    if lack > 0:
        input = np.append(input, np.zeros(list(input.shape[:-1]) + [lack], dtype=input.dtype), axis=input.ndim-1)
    # Flatten and store as int16 so that .size * (act_bit // 8) is the region size in bytes.
    input = np.reshape(input, [-1]).astype(np.int16)
    inputs.append(input)
    addrs.append(get_end_addr(addrs[-1], input.size * (act_bit // 8)))
# addrs[-2] is the start of the last input region (cell_state, the last entry of input_files).
# The cell_state output entry is redirected to it, so output and input cell_state share one region.
cell_state_offset = addrs[-2] # change output cell_state addr to input addr
addrs[cell_state_idx] = cell_state_offset
# NOTE(review): hard-coded address, also used as the output address of extern 0x102 in the
# externs table; presumably taken from the generated design's memory map — confirm when the
# design is regenerated. Only the addrs list is patched here.
hidden_state_offset = 176869824
addrs[hidden_state_idx] = hidden_state_offset
# The parameter blob starts where the last input region ends.
param_offset = addrs[-1]
print(output_offset, input_offset, param_offset)
print(addrs)

0 208896 319488
[0, 313344, 104448, 110592, 208896, 176869824, 313344, 319488]


In [6]:
# Bitstream file and the name of the accelerator IP looked up inside it.
bitfile = 'design_1.bit'
ipname = 'dvmvs_0'

# Load the overlay from the bitstream (pynq.Overlay downloads it to the device by default).
overlay = Overlay(bitfile)
# overlay.ip_dict  # uncomment to inspect the IP blocks available in the overlay
# Wrap the named IP in the NNgen control helper used by all later cells.
ip = ng.nngen_core(overlay, ipname)

In [7]:
# Size in bytes of the single buffer that holds outputs, inputs, parameters and the
# intermediate regions referenced by the externs table (192 MiB).
memory_size = 1024 * 1024 * 192
buf = allocate(shape=(memory_size,), dtype=np.uint8)
# Copy the parameter blob into the buffer starting at param_offset.
# NOTE(review): the slice length is params.size (element count) while the data is viewed as
# int8, so this only lines up when params has a 1-byte dtype — confirm.
buf[param_offset:param_offset + params.size] = params.view(np.int8)

In [8]:
# Hand the shared buffer to the accelerator wrapper.
ip.set_global_buffer(buf)
# Redirect buffer slot cell_state_idx (the cell_state output) to cell_state_offset, matching
# the patch applied to addrs, so the output lands in the cell_state input region.
ip.write_buffer_address(cell_state_idx, cell_state_offset)
# Read back slots 0..6 as a sanity check against the addrs list printed earlier.
for i in range(7):
    print(ip.read_buffer_address(i))

0
313344
104448
110592
208896
307200
313344


In [9]:
def prepare_input_value(value, lshift):
    """Quantize ``value`` to saturating signed 16-bit fixed point.

    The input is scaled by ``2 ** lshift``, rounded to the nearest integer
    (ties to even, as ``np.round`` does) and clamped to the int16 range.

    Args:
        value: Array-like or scalar of real values.
        lshift: Number of fractional bits, i.e. the left shift applied before rounding.

    Returns:
        ``np.int16`` data with the same shape as ``value``.
    """
    int16_range = np.iinfo(np.int16)
    scaled = np.asarray(value * (1 << lshift)).astype(np.float64)
    rounded = np.round(scaled)
    # Clamp to the exact int16 range [-32768, 32767]. Bounds one step wider on either
    # side would let out-of-range values overflow in the int16 cast instead of saturating.
    saturated = np.clip(rounded, int16_range.min, int16_range.max)
    return saturated.astype(np.int16)

In [10]:
# --- depth range and the affine mapping between the network output and inverse depth ---
min_depth = 0.25
max_depth = 20.0
inverse_depth_base = 1 / max_depth
inverse_depth_multiplier = 1 / min_depth - 1 / max_depth

# --- keyframe buffer settings ---
max_n_measurement_frames = 2
test_keyframe_buffer_size = 30
test_keyframe_pose_distance = 0.1
test_optimal_t_measure = 0.15
test_optimal_R_measure = 0.0

# --- reference tensors from the software model; in the cells shown here they are only
# --- referenced by commented-out debug prints ---
reference_image_preds = predictions_npz["reference_image"]
feature_halfs = predictions_npz["feature_half"]
measurement_features = predictions_npz["measurement_features"]
hidden_states = predictions_npz["hidden_state"]
cell_states = predictions_npz["cell_state"]

# --- per-frame data driving the main loop ---
n_measurement_frames = intrinsics_npz["n_measurement_frames"]
reference_images = intrinsics_npz["reference_image"]
reference_poses = intrinsics_npz["reference_pose"]

# Host-side LSTM state calculator; it quantizes with prepare_input_value at a shift of 13.
calc = lstm_state_calculator(intrinsics_npz, prepare_input_value, 14 - 1)

# Initial LSTM state: axes reordered with (0, 2, 3, 1), quantized (hidden state at shift 13,
# cell state at shift 12) and flattened for writing into the shared buffer.
org_hidden_state = prepare_input_value(
    intrinsics_npz["hidden_state"][0].transpose(0, 2, 3, 1), 14 - 1
).reshape(-1)
org_cell_state = prepare_input_value(
    intrinsics_npz["cell_state"][0].transpose(0, 2, 3, 1), 12
).reshape(-1)

# Zero block appended along the last axis of each reference image before upload.
reference_pads = np.zeros((1, 64, 96, 5), dtype=np.int16)

In [11]:
# Camera and pose data handed to the host-side cost-volume fusion operator.
half_K = intrinsics_npz["half_K"]
pose1s = intrinsics_npz["current_pose"]
pose2ss = intrinsics_npz["measurement_poses"]
# NOTE(review): the leading 11 is presumably a fixed-point shift for Fusion — confirm against extern_funcs_cython.
fusion = Fusion(11, half_K, pose1s, pose2ss)

# Table of operators the accelerator delegates to the host ("externs").
# Addresses are byte offsets into buf; shapes are the aligned shapes stored in the buffer.
# run_extern() returns early for 0x102 (func is None), and for inputs whose last axis is 8
# it passes only the first channel to func.
# opcode -> (func, input.addr, input.aligned_shape, output.addr, output.aligned_shape)
externs = {0x79: (fusion, 0, (1, 32, 48, 32), 175680960, (1, 32, 48, 64)),
           0x102: (None, 307200, (1, 32, 48, 512), hidden_state_offset, (1, 32, 48, 512)),
           0x104: (ln(12), 176912832, (1, 2, 3, 512), 176931264, (1, 2, 3, 512)),
           0x105: (ln(12), 176937408, (1, 2, 3, 512), cell_state_offset, (1, 2, 3, 512)),
           0x107: (interpolate(4, 6, 0, 'bilinear'), 104448, (1, 2, 3, 512), 176949696, (1, 4, 6, 512)),
           0x113: (interpolate(8, 12, 0, 'bilinear'), 177035712, (1, 4, 6, 256), 177048000, (1, 8, 12, 256)),
           0x115: (interpolate(8, 12, 0, 'bilinear'), 177097152, (1, 4, 6, 8), 177097536, (1, 8, 12, 8)),
           0x120: (interpolate(16, 24, 0, 'bilinear'), 177200448, (1, 8, 12, 128), 177225024, (1, 16, 24, 128)),
           0x122: (interpolate(16, 24, 0, 'bilinear'), 177323328, (1, 8, 12, 8), 177324864, (1, 16, 24, 8)),
           0x127: (interpolate(32, 48, 0, 'bilinear'), 177539904, (1, 16, 24, 64), 177589056, (1, 32, 48, 64)),
           0x129: (interpolate(32, 48, 0, 'bilinear'), 177785664, (1, 16, 24, 8), 177791808, (1, 32, 48, 8)),
           0x134: (interpolate(64, 96, 0, 'bilinear'), 178357056, (1, 32, 48, 8), 178774848, (1, 64, 96, 8)),
           0x135: (interpolate(64, 96, 0, 'bilinear'), 178258752, (1, 32, 48, 32), 178381632, (1, 64, 96, 32))}

In [18]:
def run_extern(code):
    """Run the host-side operator registered for extern opcode ``code``.

    The operator's input is read from the shared buffer ``buf`` at the address
    and aligned shape listed in ``externs``, and its result is written back at
    the listed output address, zero-padded on the last axis up to the aligned
    output shape.

    Args:
        code: Extern opcode; must be a key of ``externs``.

    Returns:
        For opcode 0x79, the input tensor that was passed to the operator
        (a view into ``buf``, so copy it if it must outlive the buffer contents).
        ``None`` for every other opcode.

    Raises:
        KeyError: If ``code`` is not in ``externs``.
    """
    if code == 0x102:
        # No host-side work for this opcode: its table entry has no function, and the
        # main loop writes the hidden state to the entry's output address itself.
        return None

    func, input_addr, input_aligned_shape, output_addr, output_aligned_shape = externs[code]
    bytes_per_word = act_bit // 8

    input_end = input_addr + shape2size(input_aligned_shape) * bytes_per_word
    in_tensor = buf[input_addr:input_end].view(np.int16).reshape(input_aligned_shape)
    if in_tensor.shape[-1] == 8:
        # Inputs stored with a last axis of 8 are passed on as their first channel only.
        in_tensor = in_tensor[:, :, :, :1]

    output = func(in_tensor)
    if output.shape != output_aligned_shape:
        # Zero-pad the last axis up to the aligned width rather than assuming 7 missing channels.
        pad_width = output_aligned_shape[-1] - output.shape[-1]
        padding = np.zeros((*output.shape[:-1], pad_width), dtype=output.dtype)
        output = np.append(output, padding, axis=output.ndim - 1)
    output = output.astype(np.int16).reshape(-1)

    output_end = output_addr + shape2size(output_aligned_shape) * bytes_per_word
    buf[output_addr:output_end] = output.view(np.uint8)

    if code == 0x79:
        return in_tensor
    return None

In [20]:
# Fresh keyframe buffer configured from the test_* settings.
keyframe_buffer = KeyframeBuffer(
    buffer_size=test_keyframe_buffer_size,
    keyframe_pose_distance=test_keyframe_pose_distance,
    optimal_t_score=test_optimal_t_measure,
    optimal_R_score=test_optimal_R_measure,
    store_return_indices=False,
)

# Recurrent state carried between frames; nothing has been processed yet.
previous = False
hidden_state = previous_depth = previous_pose = None

# Main evaluation loop: feed every reference image through the accelerator, servicing the
# extern requests on the host, and collect the decoded depth maps in `depths`.
start_time_total = time.time()

# idx counts fully processed frames (it indexes n_measurement_frames and the fusion frame number).
idx = 0
depths = []
for n in range(len(reference_images)):
    start_time = time.time()
    # The keyframe buffer decides what to do with this pose; the numeric response codes are
    # defined by KeyframeBuffer and are handled below.
    response = keyframe_buffer.try_new_keyframe(reference_poses[n][0])

    print("evaluating %05d.png (response: %d) ..." % (n + 3, response))
    if response == 2 or response == 4 or response == 5:
        # Frame is skipped without touching the recurrent state.
        continue
    elif response == 3:
        # Frame is skipped and the recurrent state is reset.
        previous = False
        hidden_state = None
        previous_depth = None
        previous_pose = None
        continue

    # Quantize the image (shift 12), append reference_pads along the last axis, flatten, and
    # write it into the first input region (addrs[len(output_files)]).
    reference_image_value = prepare_input_value(reference_images[n].transpose(0, 2, 3, 1), 12)
    # print("ri: %.4f" % np.corrcoef(reference_image_value.reshape(-1), reference_image_preds[min(n, idx+1)].reshape(-1))[0, 1])
    reference_image_value = np.append(reference_image_value, reference_pads, axis=3).reshape(-1)
    addr = addrs[len(output_files)]
    buf[addr:addr + reference_image_value.size * (act_bit // 8)] = reference_image_value.view(np.uint8)
    # print("prep", time.time() - start_time)
    
    ip.run()

    if response == 0:
        # Feature-extraction-only pass: every extern is resumed without running its host
        # operator; only the input of the first extern (feature_half) is captured and stored
        # as a keyframe. No depth is produced for this frame.
        for i in range(len(externs)):
            code = ip.wait_extern()
            ip.resume_extern()
            if i == 0:
                _, input_addr, input_aligned_shape, _, _ = externs[code]
                feature_half_value = buf[input_addr:input_addr + shape2size(input_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(input_aligned_shape)
                # print("fh: %.4f" % np.corrcoef(feature_half_value.reshape(-1), feature_halfs[min(n, idx+1)].reshape(-1))[0, 1])
                # Copy, because feature_half_value is a view into buf.
                keyframe_buffer.add_new_keyframe(reference_poses[n][0], feature_half_value.copy())
        ip.wait()
        print(time.time() - start_time)
        continue

    # start_time_1 = time.time()
    ### prepare fusion ###
    # Collect the stored features of the best measurement frames for this pose.
    measurement_features_value = []
    frame_number_value = idx
    n_measurement_frames_value = n_measurement_frames[idx]
    for measurement_frame in keyframe_buffer.get_best_measurement_frames(reference_poses[n][0], max_n_measurement_frames):
        measurement_features_value.append(measurement_frame[1])
    # print("kfb_get", time.time() - start_time_1)

    # for i in range(len(measurement_features[idx])):
    #     if i >= len(measurement_features_value):
    #         print("mf: %d" % np.sum(measurement_features[idx][i].reshape(-1)))
    #     else:
    #         print("mf: %.4f" % np.corrcoef(measurement_features_value[i].reshape(-1), measurement_features[idx][i].reshape(-1))[0, 1])
    
    # start_time_1 = time.time()
    fusion.prep(frame_number_value, n_measurement_frames_value, measurement_features_value)
    # print("fusion.prep", time.time() - start_time_1)

    # Service one extern request per table entry, in the order the accelerator raises them.
    for i in range(len(externs)):
        code = ip.wait_extern()
        # print("e", time.time() - start_time)
        if i == 0:
            # First extern: run it and keep its input (run_extern returns it for opcode 0x79).
            feature_half_value = run_extern(code)
            ip.resume_extern()

            # The LSTM state below is written while the accelerator is already running again.
            # Author's note: this timing cannot be met when the HW is fast, so adding an
            # extern before the LSTM would be better.
            # start_time_1 = time.time()
            if previous:
                ### prepare previous_depth and hidden_state ###
                # Decode the depth output still in the buffer from the previous frame:
                # first channel only, axes reordered with (0, 3, 1, 2), scaled by 2**-14,
                # then mapped through the inverse-depth parameters.
                addr = addrs[3]
                output_aligned_shape = output_aligned_shapes[3]
                depth_org = buf[addr:addr + shape2size(output_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(output_aligned_shape)
                depth_org = depth_org[:,:,:,:1]
                depth_org = (depth_org.transpose(0, 3, 1, 2) / (1 << 14)).astype(np.float32)
                inverse_depth_full = inverse_depth_multiplier * depth_org + inverse_depth_base
                previous_depth = 1.0 / inverse_depth_full
                # print("dep", time.time() - start_time_1)

                # start_time_1 = time.time()
                # Compute the hidden state input from the previous state, depth and poses,
                # and write it to the hidden_state input address.
                hidden_state_value = calc(hidden_state, previous_depth, previous_pose, reference_poses[n]).reshape(-1)
                addr = addrs[hidden_state_idx]
                buf[addr:addr + hidden_state_value.size * (act_bit // 8)] = hidden_state_value.view(np.uint8)
                # print("hs: %.4f" % np.corrcoef(hidden_state_value.reshape(-1), hidden_states[idx].reshape(-1))[0, 1])
            else:
                ### prepare hidden_state and cell_state ###
                # No previous frame: load the initial cell state and hidden state instead.
                addr = addrs[hidden_state_idx+1]
                buf[addr:addr + org_cell_state.size * (act_bit // 8)] = org_cell_state.view(np.uint8)
                addr = addrs[hidden_state_idx]
                buf[addr:addr + org_hidden_state.size * (act_bit // 8)] = org_hidden_state.view(np.uint8)
                # print("hs: %d, %d" % (np.sum(hidden_state_value.reshape(-1)), np.sum(hidden_states[idx].reshape(-1))))
            # print("lstm", time.time() - start_time_1)

            # print("fh: %.4f" % np.corrcoef(feature_half_value.reshape(-1), feature_halfs[min(n, idx+1)].reshape(-1))[0, 1])
            # start_time_1 = time.time()
            # Copy, because feature_half_value is a view into buf.
            keyframe_buffer.add_new_keyframe(reference_poses[n][0], feature_half_value.copy())
            # print("kfb_add", time.time() - start_time_1)
        else:
            run_extern(code)
            ip.resume_extern()
        # ip.resume_extern()
        # print("s", time.time() - start_time)

    # start_time_1 = time.time()
    ### keep a handle on the hidden_state output for the next frame ###
    # This is built from a slice of buf before ip.wait(); it is only read by calc() in the
    # next iteration. NOTE(review): presumably a view rather than a copy — confirm.
    addr = addrs[2]
    output_aligned_shape = output_aligned_shapes[2]
    hidden_state = buf[addr:addr + shape2size(output_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(output_aligned_shape)

    previous_pose = reference_poses[n]
    # print("post", time.time() - start_time_1)

    ip.wait()
    print(time.time() - start_time)

    # previous_depth was decoded above from the previous frame's output, so depths lags one
    # frame behind; the last frame's depth is appended after the loop.
    if previous: depths.append(previous_depth)
    # depths.append(previous_depth)
    previous = True
    idx += 1

print(time.time() - start_time_total)

# Decode the depth output left in the buffer by the last processed frame and append it,
# using the same steps the loop applies for earlier frames.
addr, output_aligned_shape = addrs[3], output_aligned_shapes[3]
depth_nbytes = shape2size(output_aligned_shape) * (act_bit // 8)
depth_org = buf[addr:addr + depth_nbytes].view(np.int16).reshape(output_aligned_shape)
# Keep only the first channel, reorder the axes with (0, 3, 1, 2), and undo the 14-bit scaling.
depth_org = depth_org[..., :1]
depth_org = (depth_org.transpose(0, 3, 1, 2) / (1 << 14)).astype(np.float32)
# Map the normalized output to inverse depth, then invert to get depth.
inverse_depth_full = inverse_depth_multiplier * depth_org + inverse_depth_base
previous_depth = 1.0 / inverse_depth_full
depths.append(previous_depth)

evaluating 00003.png (response: 0) ...
0.421311616897583
evaluating 00004.png (response: 2) ...
evaluating 00005.png (response: 2) ...
evaluating 00006.png (response: 2) ...
evaluating 00007.png (response: 2) ...
evaluating 00008.png (response: 2) ...
evaluating 00009.png (response: 1) ...
0.6063096523284912
evaluating 00010.png (response: 2) ...
evaluating 00011.png (response: 2) ...
evaluating 00012.png (response: 1) ...
0.7988040447235107
evaluating 00013.png (response: 1) ...
0.6820604801177979
evaluating 00014.png (response: 1) ...
0.6630392074584961
evaluating 00015.png (response: 2) ...
evaluating 00016.png (response: 1) ...
0.663710355758667
evaluating 00017.png (response: 1) ...
0.6779754161834717
evaluating 00018.png (response: 1) ...
0.7116200923919678
evaluating 00019.png (response: 1) ...
0.8014285564422607
evaluating 00020.png (response: 1) ...
0.6953756809234619
evaluating 00021.png (response: 1) ...
0.7000925540924072
evaluating 00022.png (response: 1) ...
0.66678833961

In [15]:
# Compare every depth map produced on the accelerator with the reference prediction by
# printing their Pearson correlation coefficient.
predictions = predictions_npz["prediction"]
print(len(predictions), len(depths))
for prediction, depth in zip(predictions, depths):
    corr_matrix = np.corrcoef(prediction.reshape(-1), depth.reshape(-1))
    print(corr_matrix[0, 1])

11 11
0.9981664390086105
0.9992716684080662
0.9974749900073795
0.998457450221372
0.999212052865551
0.9985128810770302
0.997018727528884
0.9988994157310715
0.999661151776754
0.9998893715393145
0.9990910682443448


In [16]:
# Hardware-only timing run: start the accelerator once and resume every extern immediately
# without running its host operator, so the printed intervals exclude host-side compute.
start_time_total = time.time()
ip.run()
start_time = time.time()
for i in range(len(externs)):
    code = ip.wait_extern()
    # Time from the previous resume (or from run) until this extern was raised.
    print(time.time() - start_time)
    ip.resume_extern()
    start_time = time.time()
ip.wait()
# Time from the last resume until the accelerator finished, then the total for the run.
print(time.time() - start_time)
print(time.time() - start_time_total)

0.2283163070678711
0.05095362663269043
0.020037174224853516
0.00016832351684570312
2.47955322265625e-05
0.01046609878540039
0.004106760025024414
0.006635904312133789
0.004157543182373047
0.007704973220825195
0.008263349533081055
0.01755666732788086
0.003753185272216797
0.05599236488342285
0.4237172603607178
