In [1]:
import sys
import os

sys.path.append(os.path.abspath("../common"))

import math
import time
import numpy as np
import cv2
import pynq
import dac_sdc
import ctypes

# Contest harness handle for this team; batches are requested 64 images at a time.
team_name = 'SJTU_microe'
team = dac_sdc.Team(team_name, batch_size=64)

In [2]:
# Bias table stored on disk: viewed as 6x6, of which only the first four
# columns are kept (these are the columns yolo() adds to the box values).
last_bias = np.load('last_bias.npy').reshape((6, 6))[:, :4]

# Native helper library that provides load_image().
cfuns = ctypes.cdll.LoadLibrary("./load_image.so")

# Program the FPGA and grab the two IP handles used by net().
overlay = pynq.Overlay(team.get_bitstream_path())
dma, nn_ctrl = overlay.axi_dma_0, overlay.UltraNet_Bypass_0
print('got nn accelerator!')

got nn accelerator!


In [3]:
# ---- Batch configuration -------------------------------------------------
BATCH_SIZE = team.batch_size
# Value written to accelerator register 0x10 by net().
# NOTE(review): int() truncates when BATCH_SIZE is not a power of two --
# confirm what the IP expects in that case.
reps = int(math.log2(BATCH_SIZE))

# ---- Image / grid geometry -----------------------------------------------
IMAGE_RAW_ROW = 360
IMAGE_RAW_COL = 640
IMAGE_ROW = 160
IMAGE_COL = 320
GRID_ROW = 10
GRID_ROw = GRID_ROW  # backward-compatible alias for the original misspelled name
GRID_COL = 20
# Factors that map coordinates in the IMAGE_* frame back to the raw frame.
X_SCALE = IMAGE_RAW_COL / IMAGE_COL
Y_SCALE = IMAGE_RAW_ROW / IMAGE_ROW

# ---- DMA buffers (two of each, so net() can ping-pong between them) ------
_IN_SHAPE = (BATCH_SIZE, IMAGE_RAW_ROW, IMAGE_RAW_COL, 3)
_OUT_SHAPE = (BATCH_SIZE, GRID_ROW, GRID_COL, 6, 6)

in_buffers = [
    pynq.allocate(shape=_IN_SHAPE, dtype=np.uint8, cacheable=1)
    for _ in range(2)
]
in_buffer0, in_buffer1 = in_buffers

out_buffers = [
    pynq.allocate(shape=_OUT_SHAPE, dtype=np.int32, cacheable=1)
    for _ in range(2)
]
out_buffer0, out_buffer1 = out_buffers

# Image loading is delegated to the C helper library (cfuns).
def load_image(image_paths, buff):
    """Fill ``buff`` with the images named in ``image_paths`` via C code.

    Each path is converted to a UTF-8 byte string and handed to
    ``cfuns.load_image`` together with a raw pointer to ``buff``'s memory,
    the number of paths, and the constants ``IMAGE_ROW``, ``IMAGE_COL`` and 3.
    How the C routine interprets those last three values is defined by
    ``load_image.so`` (presumably rows, columns, channels -- confirm).

    Args:
        image_paths: iterable of path-like objects; each is passed through ``str``.
        buff: array-like destination whose memory the C routine writes into.
    """
    encoded_paths = [str(path).encode('utf-8') for path in image_paths]
    c_path_array = (ctypes.c_char_p * len(encoded_paths))(
        *(ctypes.c_char_p(raw) for raw in encoded_paths)
    )

    # Keep the ndarray view in a local so it outlives the pointer we pass on.
    pixel_view = np.asarray(buff)
    pixel_ptr = pixel_view.ctypes.data_as(ctypes.c_char_p)

    cfuns.load_image(c_path_array, pixel_ptr, len(encoded_paths), IMAGE_ROW, IMAGE_COL, 3)
    
    
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))``, element-wise.

    Args:
        x: scalar or NumPy array.

    Returns:
        NumPy scalar or array with the same shape as ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def yolo(out_buffer, batch_n, div, last_bias=None, result=None):
    """Decode one bounding box per image from raw accelerator output.

    The first ``batch_n`` entries of ``out_buffer`` are viewed as
    ``(batch_n, cells, 6, 6)``. For every cell, column 4 of the last axis is
    summed over the 6 rows to give a score; the highest-scoring cell of each
    image is then decoded. Columns 0-1 go through a sigmoid and columns 2-3
    through ``exp``, each averaged over the 6 rows.
    (Presumably the 6 rows are anchors/heads and the columns are
    x, y, w, h, confidence, ... -- confirm against the network definition.)

    Args:
        out_buffer: array-like holding at least ``batch_n`` raw outputs.
        batch_n: number of images to decode.
        div: divisor applied to the raw values before the bias is added.
        last_bias: array broadcastable to ``(6, 4)`` added after the division;
            ``None`` means no bias is added.
        result: list that receives one ``[xmin, xmax, ymin, ymax]`` entry
            (Python ints, truncated toward zero) per image. A new list is
            created when ``None``.

    Returns:
        The list the boxes were appended to (``result`` itself when given).
    """
    if result is None:
        result = []
    if batch_n <= 0:
        # An empty batch cannot be reshaped with an inferred (-1) axis.
        return result

    res_np = np.array(out_buffer[:batch_n]).reshape(batch_n, -1, 6, 6)

    # Score every grid cell and pick the best one per image.
    conf = res_np[..., 4].sum(axis=2)
    max_index = conf.argmax(1)

    # Flat cell index -> (column, row), cells being laid out row-major.
    grid_x = max_index % GRID_COL
    grid_y = max_index // GRID_COL

    # Gather the winning cell of every image in one indexing operation:
    # result has shape (batch_n, 6, 4).
    boxs = np.asarray(
        res_np[np.arange(batch_n), max_index, :, :4] / div, dtype=np.float64
    )
    if last_bias is not None:
        boxs = boxs + last_bias

    xy = sigmoid(boxs[..., :2]).mean(axis=1)
    wh = np.exp(boxs[..., 2:4]).mean(axis=1)

    # Offset within the cell -> position in grid units.
    xy[:, 0] += grid_x
    xy[:, 1] += grid_y

    # Hard-coded cell stride; with this notebook's constants it equals
    # IMAGE_COL / GRID_COL (320 / 20) and IMAGE_ROW / 10.
    xy *= 16
    # Hard-coded size scale (presumably the anchor size -- confirm).
    wh *= 20

    # Rescale from the IMAGE_* frame to the raw image frame.
    xy[:, 0] *= X_SCALE
    xy[:, 1] *= Y_SCALE
    wh[:, 0] *= X_SCALE
    wh[:, 1] *= Y_SCALE

    half_w = wh[:, 0] / 2
    half_h = wh[:, 1] / 2
    xmin = xy[:, 0] - half_w
    xmax = xy[:, 0] + half_w
    ymin = xy[:, 1] - half_h
    ymax = xy[:, 1] + half_h

    # int() keeps plain Python ints in the result list, as before.
    result.extend(
        [int(x0), int(x1), int(y0), int(y1)]
        for x0, x1, y0, y1 in zip(xmin, xmax, ymin, ymax)
    )
    return result

# Mutable state shared across successive net() calls.
first_batch = True             # True until the first batch has been pre-loaded
which_buffer = 0               # index (0/1) of the buffer pair net() uses next
net_cnt = 0                    # number of accelerator runs started so far
last_batch_size = BATCH_SIZE   # image count net() uses when decoding the final batch

def net(img_paths, result):
    """Advance the double-buffered inference pipeline by one step.

    Call once per batch, then once more with ``img_paths=None`` to flush.
    The first call only pre-loads images. Every later call starts the
    accelerator on the batch loaded by the previous call and, while the DMA
    runs, loads ``img_paths`` into the other input buffer and decodes the
    output of the run before that. The flushing call additionally decodes the
    run it just finished and then resets the module-level pipeline state so a
    new pass can start from scratch.

    Args:
        img_paths: list of image paths for the next batch, or ``None`` when
            there are no more batches.
        result: list that receives one ``[xmin, xmax, ymin, ymax]`` entry per
            processed image (filled by ``yolo``).
    """
    global first_batch
    global which_buffer
    global net_cnt
    global last_batch_size

    # buffer first batch
    if first_batch:
        if img_paths is None:
            # No batch was ever loaded, so there is nothing to run or flush.
            return
        first_batch = False
        which_buffer = 0
        # Record the size here as well: if this is the only batch and it is
        # short, the flush must not decode BATCH_SIZE stale entries.
        last_batch_size = len(img_paths)
        load_image(img_paths, in_buffers[which_buffer])
        return

    # count
    net_cnt += 1
    nn_ctrl.write(0x0, 0)  # Reset
    # reps is defined by the UltraNet_Bypass IP (presumably the log2 batch
    # count, given how it is computed -- confirm).
    nn_ctrl.write(0x10, reps)
    nn_ctrl.write(0x0, 1)  # Deassert reset
    running = which_buffer
    dma.recvchannel.transfer(out_buffers[running])
    dma.sendchannel.transfer(in_buffers[running])

    # switch buffer: everything below touches the pair the DMA is NOT using
    which_buffer = 1 - running

    # buffer next batch
    if img_paths is not None:
        load_image(img_paths, in_buffers[which_buffer])
        # The most recently loaded batch is the one decoded at flush time.
        last_batch_size = len(img_paths)

    # yolo on the previous run's output; only the batch decoded at flush
    # time is sized by last_batch_size, so BATCH_SIZE is used here.
    # 7 * 15 is the fixed divisor handed to yolo (presumably the
    # accelerator's output scale -- confirm).
    if net_cnt > 1:
        yolo(out_buffers[which_buffer], BATCH_SIZE, 7 * 15, last_bias, result)

    dma.sendchannel.wait()
    dma.recvchannel.wait()

    # last batch
    if img_paths is None:
        yolo(out_buffers[running], last_batch_size, 7 * 15, last_bias, result)
        # Return to the initial state so the inference loop can be re-run
        # without re-executing the definitions.
        first_batch = True
        which_buffer = 0
        net_cnt = 0
        last_batch_size = BATCH_SIZE
        
################################Inference##################################
# Bookkeeping for the timed run.
interval_time = 0
total_time = 0
total_energy = 0
result = []
team.reset_batch_count()

# Power is sampled on the 5V rail while inference runs.
rails = pynq.get_rails()
recorder = pynq.DataRecorder(rails["5V"].power)

start = time.time()
with recorder.record(0.05):
    # net() is called once per batch and once more with None, which makes it
    # decode the final batch; that None call is the last iteration here.
    image_paths = team.get_next_batch()
    net(image_paths, result)
    while image_paths is not None:
        image_paths = team.get_next_batch()
        net(image_paths, result)
end = time.time()

t = end - start
# Energy measurements: mean recorded power times elapsed wall-clock time.
power = recorder.frame["5V_power"].mean()
energy = power * t

total_time = t
total_energy = energy
print("Total time:", total_time, "seconds")
print("power:", power, "W")
print("Total energy:", total_energy, "J")
print('images nums: {} .'.format(len(result)))
print('fps: {} .'.format(len(result) / total_time))


Total time: 227.96575498580933 seconds
power: 4.92153863585 W
Total energy: 1121.94227081 J
images nums: 52500 .
fps: 230.29774802477715 .


In [4]:
# Release the contiguous DMA memory. Deleting only the four buffer names does
# not free anything, because the in_buffers / out_buffers lists still hold
# references to the same objects. Free each buffer explicitly, then drop every
# name so a later accidental use fails with NameError instead of touching
# freed memory.
for _buf in in_buffers + out_buffers:
    _buf.freebuffer()
del _buf
del in_buffer0, in_buffer1, out_buffer0, out_buffer1
del in_buffers, out_buffers

print(len(result))
team.save_results_xml(result, total_time, total_energy)
print("XML results written successfully.")


52500
XML results written successfully.
