In [1]:
from extern_funcs_cython import interpolate, ln, lstm_state_calculator
from fusion import fusion_quantize_cython, prep_cython
from keyframe_buffer import KeyframeBuffer
from pynq import Overlay, allocate
import numpy as np
import math
import time
import nngen_ctrl as ng

In [2]:
# Flat parameter array; it is copied into the shared buffer at param_offset further down.
params = np.load("../params_nngen/params.npz")['arr_0']
# Sample input/output tensors, keyed by the names listed in input_files/output_files.
# They are used below to derive the aligned shapes and the buffer address map.
inputs_npz = np.load("../params_nngen/inputs.npz")
outputs_npz = np.load("../params_nngen/outputs.npz")
# predictions_npz = np.load("../params_nngen/predictions.npz")

In [3]:
# Names of the output tensors, in the order in which their buffer slots are laid out
# (slot 0 .. len(output_files) - 1 of `addrs`).
output_files = ['feature_half',
                'cell_state',
                'hidden_state',
                'depth_org']
# Slot of the *output* cell_state; its address is later redirected to the input cell_state.
cell_state_idx = output_files.index('cell_state')
# Names of the input tensors; their slots follow directly after the output slots.
input_files = ['reference_image',
               'hidden_state',
               'cell_state']
# Slot of the *input* hidden_state (output slots come first, hence the offset).
hidden_state_idx = len(output_files) + input_files.index('hidden_state')

In [4]:
chunk_size = 64
def get_end_addr(addr, memory_size):
    """Return the end address of a region, rounded up to a multiple of chunk_size.

    Args:
        addr: start address of the region in bytes.
        memory_size: size of the region in bytes.

    Returns:
        The smallest multiple of ``chunk_size`` that is >= ``addr + memory_size``.
    """
    region_end = addr + memory_size
    n_chunks = math.ceil(region_end / chunk_size)
    return int(n_chunks) * chunk_size

def shape2size(shape):
    """Return the number of elements described by ``shape``.

    Args:
        shape: iterable of dimension extents.

    Returns:
        The product of all extents; 1 for an empty shape.
    """
    n_elements = 1
    for extent in shape:
        n_elements = n_elements * extent
    return n_elements

In [5]:
# Build the buffer address map: one slot per output tensor, then one per input tensor,
# followed by the start of the parameter region.
axi_datawidth = 128
act_bit = 16
# Number of activation words per AXI beat; the last axis of every tensor is padded to
# a multiple of this.
num_align_words = axi_datawidth // act_bit

output_offset = 0
outputs = []
output_aligned_shapes = []
addrs = [output_offset]
for output_name in output_files:
    output = outputs_npz[output_name]
    # Round the last axis up to the alignment width.
    output_aligned_shape = (*output.shape[:-1], int(math.ceil(output.shape[-1] / num_align_words)) * num_align_words)
    outputs.append(output)
    output_aligned_shapes.append(output_aligned_shape)
    addrs.append(get_end_addr(addrs[-1], shape2size(output_aligned_shape) * (act_bit // 8)))

input_offset = addrs[-1]
inputs = []
# NOTE: the loop variables deliberately avoid the names `input` and `file`; binding them
# at notebook scope would shadow the builtins for every later cell.
for input_name in input_files:
    input_value = inputs_npz[input_name]
    # Number of zero channels needed to reach the next multiple of num_align_words.
    lack = (num_align_words - (input_value.shape[-1] % num_align_words)) % num_align_words
    if lack > 0:
        input_value = np.append(input_value, np.zeros(list(input_value.shape[:-1]) + [lack], dtype=input_value.dtype), axis=input_value.ndim-1)
    input_value = np.reshape(input_value, [-1]).astype(np.int16)
    inputs.append(input_value)
    addrs.append(get_end_addr(addrs[-1], input_value.size * (act_bit // 8)))
# change output cell_state addr to input addr
# The slot is looked up by name instead of addrs[-2], so the result no longer depends on
# cell_state being the last entry of input_files.
cell_state_offset = addrs[len(output_files) + input_files.index('cell_state')]
addrs[cell_state_idx] = cell_state_offset
param_offset = addrs[-1]
print(output_offset, input_offset, param_offset)
print(addrs)

0 208896 319488
[0, 313344, 104448, 110592, 208896, 307200, 313344, 319488]


In [6]:
# Bitstream to load and the name of the IP block inside it that is wrapped below.
bitfile = 'design_1.bit'
ipname = 'dvmvs_0'

# Load the bitstream through PYNQ.
overlay = Overlay(bitfile)
# overlay.ip_dict
# Controller object used below for buffer setup and the run/wait/extern handshake.
ip = ng.nngen_core(overlay, ipname)

In [7]:
# 192 MiB byte buffer obtained from pynq.allocate; it holds outputs, inputs, parameters
# and the intermediate tensors addressed by `externs`.
memory_size = 1024 * 1024 * 192
buf = allocate(shape=(memory_size,), dtype=np.uint8)
# Copy the parameters to the start of the parameter region.
# NOTE(review): the slice length uses params.size, which equals the byte count only if
# params has a 1-byte dtype — confirm against how params.npz is exported.
buf[param_offset:param_offset + params.size] = params.view(np.int8)

In [8]:
# Hand the shared buffer to the IP controller.
ip.set_global_buffer(buf)
# Point the output cell_state slot at the input cell_state address (see the address map).
ip.write_buffer_address(cell_state_idx, cell_state_offset)
# Print the address of each tensor slot; 7 = len(output_files) + len(input_files).
for i in range(7):
    print(ip.read_buffer_address(i))

0
313344
104448
110592
208896
307200
313344


In [9]:
def prepare_input_value(value, lshift):
    """Quantize ``value`` to int16 fixed point with ``lshift`` fractional bits.

    The value is scaled by ``2 ** lshift``, rounded to the nearest integer and
    saturated to the int16 range.

    The previous implementation clipped to [-32769, 32768] *before* rounding, so
    saturated values fell outside int16 and overflowed in the final cast. Rounding
    first and clipping to the real int16 limits (as ``round_and_clip`` does) makes
    out-of-range inputs saturate instead.

    Args:
        value: array-like (or scalar) of real values.
        lshift: number of fractional bits, i.e. the left shift applied to ``value``.

    Returns:
        ``np.int16`` array (or scalar) holding the quantized values.
    """
    limits = np.iinfo(np.int16)
    scaled = np.asarray(value * (1 << lshift), dtype=np.float64)
    # Round before clipping so that e.g. 32767.6 saturates to 32767 instead of
    # rounding up to 32768 after the clip.
    rounded = np.round(scaled)
    return np.clip(rounded, limits.min, limits.max).astype(np.int16)

In [10]:
# reference_image_preds = predictions_npz["reference_image"]
# feature_halfs = predictions_npz["feature_half"]
# measurement_features = predictions_npz["measurement_features"]
# hidden_states = predictions_npz["hidden_state"]
# cell_states = predictions_npz["cell_state"]

# Maximum number of keyframes requested from the keyframe buffer per predicted frame.
max_n_measurement_frames = 2
# Depth range used to map the network output to metric depth:
# inverse_depth = inverse_depth_multiplier * output + inverse_depth_base.
min_depth = 0.25
max_depth = 20.0
inverse_depth_base = 1 / max_depth
inverse_depth_multiplier = 1 / min_depth - 1 / max_depth

# Settings passed to KeyframeBuffer in the prediction loop.
test_keyframe_buffer_size = 30
test_keyframe_pose_distance = 0.1
test_optimal_t_measure = 0.15
test_optimal_R_measure = 0.0

# org_hidden_state = prepare_input_value(intrinsics_npz["hidden_state"][0].transpose(0, 2, 3, 1), 14-1).reshape(-1)
# org_cell_state = prepare_input_value(intrinsics_npz["cell_state"][0].transpose(0, 2, 3, 1), 12).reshape(-1)
# All-zero LSTM states written to the buffer when no previous frame is available.
# 3072 matches the (1, 2, 3, 512) state shape used in `externs`.
org_hidden_state = np.zeros((3072), dtype=np.int16)
org_cell_state = np.zeros((3072), dtype=np.int16)

# Zero channels appended to the last axis of every reference image before it is written
# to the buffer (presumably pads 3 image channels up to num_align_words — TODO confirm).
reference_pads = np.zeros([1, 64, 96, 5], dtype=np.int16)

In [11]:
def get_warp_grid_for_cost_volume_calculation(width, height):
    """Build homogeneous pixel coordinates for a ``width`` x ``height`` image.

    Args:
        width: number of columns (int).
        height: number of rows (int).

    Returns:
        ``float32`` array of shape ``(3, height * width)``. Row 0 holds the x (column)
        index, row 1 the y (row) index and row 2 is all ones; pixels are enumerated
        row by row. The result is a transposed view, exactly as before.
    """
    col_coords = np.linspace(0, width - 1, num=int(width))
    row_coords = np.linspace(0, height - 1, num=int(height))
    col_grid, row_grid = np.meshgrid(col_coords, row_coords)
    homogeneous = np.ones(shape=(height, width))
    # (height, width, 3): one [x, y, 1] triple per pixel.
    per_pixel = np.dstack((col_grid, row_grid, homogeneous)).astype(np.float32)
    return per_pixel.reshape(-1, 3).T
# Pixel grid for a 48 x 32 image, i.e. half of the 96 x 64 size used by reference_pads;
# it is passed to prep_cython in the prediction loop.
warp_grid = get_warp_grid_for_cost_volume_calculation(int(96 / 2), int(64 / 2))

def round_and_clip(input):
    """Round ``input`` to the nearest integer and saturate it to the int16 range.

    Args:
        input: array-like of numeric values.

    Returns:
        ``np.int16`` array with values limited to [-32768, 32767].
    """
    limits = np.iinfo(np.int16)
    # Go through int64 first so that out-of-range values are clipped, not wrapped.
    as_int64 = np.round(input).astype(np.int64)
    saturated = np.clip(as_int64, limits.min, limits.max)
    return saturated.astype(np.int16)

In [12]:
# Table of the operations that run_extern executes in software on behalf of the hardware.
# opcode -> (func, input.addr, input.aligned_shape, output.addr, output.aligned_shape)
# Addresses are byte offsets into `buf`; shapes are the aligned int16 tensor shapes.
# The literal addresses are hard-coded (presumably taken from the generated nngen
# design — TODO confirm they still match when the design is regenerated).
# 0x79 has no func: run_extern handles it through fusion_quantize_cython when
# warped_image2s is supplied. Its input address 0 is the first output slot
# (output_offset).
# 0x105 writes its result to the cell_state slot; 0x107 reads from 104448, which matches
# the hidden_state output address printed by the address-map cell.
externs = {0x79: (None, 0, (1, 32, 48, 32), 175680960, (1, 32, 48, 64)),
           0x104: (ln(12), 176906688, (1, 2, 3, 512), 176925120, (1, 2, 3, 512)),
           0x105: (ln(12), 176931264, (1, 2, 3, 512), cell_state_offset, (1, 2, 3, 512)),
           0x107: (interpolate(4, 6, 0, 'bilinear'), 104448, (1, 2, 3, 512), 176943552, (1, 4, 6, 512)),
           0x113: (interpolate(8, 12, 0, 'bilinear'), 177029568, (1, 4, 6, 256), 177041856, (1, 8, 12, 256)),
           0x115: (interpolate(8, 12, 0, 'bilinear'), 177091008, (1, 4, 6, 8), 177091392, (1, 8, 12, 8)),
           0x120: (interpolate(16, 24, 0, 'bilinear'), 177194304, (1, 8, 12, 128), 177218880, (1, 16, 24, 128)),
           0x122: (interpolate(16, 24, 0, 'bilinear'), 177317184, (1, 8, 12, 8), 177318720, (1, 16, 24, 8)),
           0x127: (interpolate(32, 48, 0, 'bilinear'), 177533760, (1, 16, 24, 64), 177582912, (1, 32, 48, 64)),
           0x129: (interpolate(32, 48, 0, 'bilinear'), 177779520, (1, 16, 24, 8), 177785664, (1, 32, 48, 8)),
           0x134: (interpolate(64, 96, 0, 'bilinear'), 178350912, (1, 32, 48, 8), 178768704, (1, 64, 96, 8)),
           0x135: (interpolate(64, 96, 0, 'bilinear'), 178252608, (1, 32, 48, 32), 178375488, (1, 64, 96, 32))}

In [22]:
def run_extern(code, warped_image2s=None):
    """Execute one extern operation in software and write its result into ``buf``.

    The operation is looked up in ``externs``; its input tensor is read from ``buf``,
    processed, padded on the last axis up to the aligned output shape and written back
    to the output address.

    Args:
        code: opcode of the extern operation. ``0x102`` is a no-op.
        warped_image2s: when given, the operation's ``func`` is ignored and the output
            is computed by ``fusion_quantize_cython(input, warped_image2s)`` instead.

    Returns:
        For opcode ``0x79`` the input tensor (a view into ``buf`` — copy it if it must
        outlive later buffer writes); ``None`` for every other opcode.

    Raises:
        KeyError: if ``code`` is neither ``0x102`` nor a key of ``externs``.
    """
    if code == 0x102:
        return None

    func, input_addr, input_aligned_shape, output_addr, output_aligned_shape = externs[code]
    bytes_per_word = act_bit // 8
    input_nbytes = shape2size(input_aligned_shape) * bytes_per_word
    output_nbytes = shape2size(output_aligned_shape) * bytes_per_word

    input_value = buf[input_addr:input_addr + input_nbytes].view(np.int16).reshape(input_aligned_shape)
    # Tensors with 8 channels are reduced to their first channel before processing
    # (presumably single-channel data padded to the alignment width — TODO confirm).
    if input_value.shape[-1] == 8:
        input_value = input_value[:,:,:,:1]

    if warped_image2s is None:
        output = func(input_value)
    else:
        output = np.array(fusion_quantize_cython(np.array(input_value), warped_image2s), dtype=np.int16)
        # NumPy version kept by the original author next to the Cython call:
        # output = round_and_clip(np.concatenate([np.sum(input * warped_image2.astype(np.int32), axis=3, keepdims=True) for warped_image2 in warped_image2s], axis=3) / (1 << 16))

    if output.shape != output_aligned_shape:
        # Zero-pad the last axis up to the aligned width. The width is computed from the
        # shapes instead of the former hard-coded 7, so any channel count is handled.
        pad_width = output_aligned_shape[-1] - output.shape[-1]
        if pad_width > 0:
            padding = np.zeros((*output.shape[:-1], pad_width), dtype=output.dtype)
            output = np.append(output, padding, axis=output.ndim-1)

    output = output.astype(np.int16).reshape(-1)
    buf[output_addr:output_addr + output_nbytes] = output.view(np.uint8)

    if code == 0x79:
        return input_value
    return None

In [26]:
# Run depth prediction on the hardware for every test sequence and store the resulting
# depth maps in depths/<sequence>.npz.
test_dataset_names = ["chess-seq-01", "chess-seq-02", "fire-seq-01", "fire-seq-02", "heads-seq-02", "office-seq-01", "office-seq-03", "redkitchen-seq-01", "redkitchen-seq-07"]
for test_dataset_name in test_dataset_names:
    print("Predicting", test_dataset_name)
    # Per-sequence data: reference images, camera poses and the half_K matrix.
    intrinsics_npz = np.load("intrinsics/%s.npz" % test_dataset_name)
    reference_images = intrinsics_npz["reference_image"]
    reference_poses = intrinsics_npz["reference_pose"]
    # Callable that produces the next input hidden state (see its use below).
    calc = lstm_state_calculator(intrinsics_npz, prepare_input_value, 14-1)
    half_K = intrinsics_npz["half_K"][0]
    inv_half_K = np.linalg.inv(half_K)

    keyframe_buffer = KeyframeBuffer(buffer_size=test_keyframe_buffer_size,
                                    keyframe_pose_distance=test_keyframe_pose_distance,
                                    optimal_t_score=test_optimal_t_measure,
                                    optimal_R_score=test_optimal_R_measure,
                                    store_return_indices=False)
    # Recurrent state carried from one predicted frame to the next.
    previous = False
    hidden_state = None
    previous_depth = None
    previous_pose = None

    start_time_total = time.time()

    idx = 0
    depths = []
    for n in range(len(reference_images)):
        start_time = time.time()
        # The response code decides how this frame is handled; its meaning is defined by
        # KeyframeBuffer (not visible here).
        response = keyframe_buffer.try_new_keyframe(reference_poses[n][0])

        print("evaluating %05d.png (response: %d) ..." % (n, response))
        if response == 2 or response == 4 or response == 5:
            # Frame is skipped without touching the recurrent state.
            continue
        elif response == 3:
            # Frame is skipped and the recurrent state is reset.
            previous = False
            hidden_state = None
            previous_depth = None
            previous_pose = None
            continue

        # Quantize the image with 12 fractional bits, append the zero pad channels and
        # write it to the first input slot (reference_image).
        reference_image_value = prepare_input_value(reference_images[n].transpose(0, 2, 3, 1), 12)
        reference_image_value = np.append(reference_image_value, reference_pads, axis=3).reshape(-1)
        addr = addrs[len(output_files)]
        buf[addr:addr + reference_image_value.size * (act_bit // 8)] = reference_image_value.view(np.uint8)
        # print("prep", time.time() - start_time)
        
        ip.run()

        if response == 0:
            # No prediction for this frame: every extern is resumed immediately without
            # running its software function. Only the tensor at the input address of the
            # first extern is read and stored as a new keyframe.
            for i in range(len(externs)):
                code = ip.wait_extern()
                ip.resume_extern()
                if i == 0:
                    _, input_addr, input_aligned_shape, _, _ = externs[code]
                    feature_half_value = buf[input_addr:input_addr + shape2size(input_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(input_aligned_shape)
                    keyframe_buffer.add_new_keyframe(reference_poses[n][0], feature_half_value.copy())
            ip.wait()
            print(time.time() - start_time)
            continue

        # start_time_1 = time.time()
        ### prepare fusion ###
        measurement_poses_value = []
        measurement_features_value = []
        frame_number_value = idx
        # Each measurement frame is a (pose, feature) pair taken from the keyframe buffer.
        measurement_frames = keyframe_buffer.get_best_measurement_frames(reference_poses[n][0], max_n_measurement_frames)
        for measurement_frame in measurement_frames:
            measurement_poses_value.append(measurement_frame[0])
            measurement_features_value.append(measurement_frame[1])
        # print("kfb_get", time.time() - start_time_1)

        # start_time_1 = time.time()
        inv_pose2s = np.linalg.inv(np.array(measurement_poses_value))
        # Result is handed to run_extern for the first extern of this frame.
        warped_image2s = np.array(prep_cython(len(measurement_frames), np.array(measurement_features_value), half_K, inv_half_K, reference_poses[n][0], inv_pose2s, warp_grid), dtype=np.int16)
        # print("fusion.prep", time.time() - start_time_1)

        # timing cannot meet when HW is fast, so add extern before lstm should be better
        # start_time_1 = time.time()
        if previous:
            ### prepare previous_depth and hidden_state ###
            # Slot 3 is depth_org: it still holds the output of the previous run. Keep the
            # first channel and scale the int16 values by 1 / 2**14.
            addr = addrs[3]
            output_aligned_shape = output_aligned_shapes[3]
            depth_org = buf[addr:addr + shape2size(output_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(output_aligned_shape)
            depth_org = depth_org[:,:,:,:1]
            depth_org = (depth_org.transpose(0, 3, 1, 2) / (1 << 14)).astype(np.float32)
            inverse_depth_full = inverse_depth_multiplier * depth_org + inverse_depth_base
            previous_depth = 1.0 / inverse_depth_full
            # print("dep", time.time() - start_time_1)

            # start_time_1 = time.time()
            # Compute the input hidden state from the previous state, depth and poses and
            # write it to the input hidden_state slot.
            hidden_state_value = calc(hidden_state, previous_depth, previous_pose, reference_poses[n]).reshape(-1)
            addr = addrs[hidden_state_idx]
            buf[addr:addr + hidden_state_value.size * (act_bit // 8)] = hidden_state_value.view(np.uint8)
        else:
            ### prepare hidden_state and cell_state ###
            # No previous frame: write the all-zero states. hidden_state_idx + 1 is the
            # input cell_state slot (it follows hidden_state in input_files).
            addr = addrs[hidden_state_idx+1]
            buf[addr:addr + org_cell_state.size * (act_bit // 8)] = org_cell_state.view(np.uint8)
            addr = addrs[hidden_state_idx]
            buf[addr:addr + org_hidden_state.size * (act_bit // 8)] = org_hidden_state.view(np.uint8)
        # print("lstm", time.time() - start_time_1)

        # Serve every extern request of this run in software; the first one gets the
        # warped measurement features and returns the tensor stored as a new keyframe.
        for i in range(len(externs)):
            code = ip.wait_extern()
            # print("e", time.time() - start_time)
            if i == 0:
                feature_half_value = run_extern(code, warped_image2s)
                ip.resume_extern()

                # start_time_1 = time.time()
                keyframe_buffer.add_new_keyframe(reference_poses[n][0], feature_half_value.copy())
                # print("kfb_add", time.time() - start_time_1)
            else:
                run_extern(code)
                ip.resume_extern()
            # print("s", time.time() - start_time)

        # start_time_1 = time.time()
        ### prepare hidden_state and cell_state ###
        # Slot 2 is the hidden_state output; this is a view into buf, taken before
        # ip.wait() returns.
        addr = addrs[2]
        output_aligned_shape = output_aligned_shapes[2]
        hidden_state = buf[addr:addr + shape2size(output_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(output_aligned_shape)

        previous_pose = reference_poses[n]
        # print("post", time.time() - start_time_1)

        ip.wait()
        print(time.time() - start_time)

        # previous_depth was decoded at the start of this iteration from the previous
        # run's output, so depths lag one predicted frame behind.
        if previous: depths.append(previous_depth)
        previous = True
        idx += 1

    print(time.time() - start_time_total)

    # Decode the depth of the last run, which the loop above has not appended yet.
    # NOTE(review): this also runs when no frame of the sequence was predicted, in which
    # case the buffer holds whatever an earlier run left there — confirm this is intended.
    addr = addrs[3]
    output_aligned_shape = output_aligned_shapes[3]
    depth_org = buf[addr:addr + shape2size(output_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(output_aligned_shape)
    depth_org = depth_org[:,:,:,:1]
    depth_org = (depth_org.transpose(0, 3, 1, 2) / (1 << 14)).astype(np.float32)
    inverse_depth_full = inverse_depth_multiplier * depth_org + inverse_depth_base
    previous_depth = 1.0 / inverse_depth_full
    depths.append(previous_depth)

    np.savez_compressed("depths/%s" % test_dataset_name, depths=depths)

Predicting chess-seq-01
evaluating 00000.png (response: 0) ...
0.4221057891845703
evaluating 00001.png (response: 1) ...
0.642113447189331
evaluating 00002.png (response: 1) ...
1.0979959964752197
evaluating 00003.png (response: 1) ...
1.1114873886108398
evaluating 00004.png (response: 1) ...
1.0648720264434814
evaluating 00005.png (response: 1) ...
0.904282808303833
evaluating 00006.png (response: 1) ...
1.0485851764678955
evaluating 00007.png (response: 1) ...
0.996971845626831
evaluating 00008.png (response: 1) ...
1.0061488151550293
evaluating 00009.png (response: 1) ...
0.9935886859893799
evaluating 00010.png (response: 1) ...
0.47214627265930176
evaluating 00011.png (response: 1) ...
0.677609920501709
evaluating 00012.png (response: 1) ...
0.461810827255249
evaluating 00013.png (response: 1) ...
0.46420788764953613
evaluating 00014.png (response: 1) ...
0.4628124237060547
evaluating 00015.png (response: 1) ...
0.4642002582550049
evaluating 00016.png (response: 1) ...
0.4624505043

In [24]:
# predictions = predictions_npz["prediction"]
# print(len(predictions), len(depths))
# for prediction, depth in zip(predictions, depths):
#     print(np.corrcoef(prediction.reshape(-1), depth.reshape(-1))[0, 1])

11 11
0.9981648390543427
0.9992443523984911
0.9974589782005606
0.9984591642086847
0.9992527954355334
0.9986021878380238
0.9969824587282724
0.9988970198837177
0.9997805005553531
0.9998966528702832
0.9991100371905802


In [16]:
# Hardware-only timing: run the design once and resume every extern immediately, without
# executing its software function. Prints the time spent waiting for each extern, then
# the time until completion and the total.
start_time_total = time.time()
ip.run()
start_time = time.time()
for i in range(len(externs)):
    code = ip.wait_extern()
    # Time the hardware needed to reach this extern since the previous resume.
    print(time.time() - start_time)
    ip.resume_extern()
    start_time = time.time()
ip.wait()
print(time.time() - start_time)
print(time.time() - start_time_total)

0.22835636138916016
0.0709683895111084
0.0001647472381591797
2.47955322265625e-05
0.01046442985534668
0.0041043758392333984
0.006622791290283203
0.004144906997680664
0.007692813873291016
0.008275985717773438
0.017543792724609375
0.0037505626678466797
0.055997371673583984
0.4231538772583008
