In [1]:
import sys
import os

sys.path.append(os.path.abspath("../common"))

import math
import time
import numpy as np
import cv2
import pynq
import dac_sdc
import ctypes
from multiprocessing import Process, Pipe, Queue, Event, Manager, Lock, Value

# Team identity handed to the dac_sdc helper. `team` is used later in this
# notebook for the bitstream path, the batch iterator and the result writer.
team_name = 'iSmart'
# batch_size is read back below as team.batch_size and sizes every DMA buffer.
team = dac_sdc.Team(team_name, batch_size = 64)

In [2]:
# Load the overlay from the bitstream path reported by the team object.
overlay = pynq.Overlay(team.get_bitstream_path())
# Shared library with the C helpers called later as cfuns.load_image and
# cfuns.yolo. The path is relative to the notebook's working directory.
cfuns = ctypes.cdll.LoadLibrary("./load_image_pingpong3.so")
# DMA engine whose send/recv channels move the input and output buffers.
dma = overlay.axi_dma_0
# Allocator used below for the contiguous (CMA) input/output arrays.
xlnk = pynq.Xlnk()
# Register interface of the accelerator; net() writes offsets 0x0 and 0x10.
nn_ctrl = overlay.ultra_net_0
print('got nn accelerator!')

got nn accelerator!


In [3]:
# Number of images per batch; also the leading dimension of every DMA buffer.
BATCH_SIZE = team.batch_size

# Row/column counts used to size the input buffers.
# NOTE(review): presumably the resolution of the source frames -- confirm.
IMAGE_RAW_ROW = 360
IMAGE_RAW_COL = 640

# Row/column counts passed to the C image loader in load_image().
# NOTE(review): presumably the resolution fed to the network -- confirm.
IMAGE_ROW = 160
IMAGE_COL = 320

# Second and third dimensions of the output buffers.
GRID_ROW = 10
# Backward-compatible alias for the original misspelled name; any code that
# still refers to GRID_ROw keeps working.
GRID_ROw = GRID_ROW
GRID_COL = 20

# Ratio between the "raw" and the loader dimensions along each axis.
X_SCALE = IMAGE_RAW_COL / IMAGE_COL
Y_SCALE = IMAGE_RAW_ROW / IMAGE_ROW

# Two input and two output buffers so that net() can alternate between them
# (index 0 / index 1) from one batch to the next.
in_buffer0 = xlnk.cma_array(shape=(BATCH_SIZE, IMAGE_RAW_ROW, IMAGE_RAW_COL, 3), dtype=np.uint8, cacheable=1)
in_buffer1 = xlnk.cma_array(shape=(BATCH_SIZE, IMAGE_RAW_ROW, IMAGE_RAW_COL, 3), dtype=np.uint8, cacheable=1)
in_buffers = [in_buffer0, in_buffer1]
out_buffer0 = xlnk.cma_array(shape=(BATCH_SIZE, GRID_ROW, GRID_COL, 6, 6), dtype=np.int32, cacheable=1)
out_buffer1 = xlnk.cma_array(shape=(BATCH_SIZE, GRID_ROW, GRID_COL, 6, 6), dtype=np.int32, cacheable=1)
out_buffers = [out_buffer0, out_buffer1]

# use c functions to load image
def load_image(image_paths, buff):
    """Fill ``buff`` with the images named in ``image_paths`` via the C helper.

    Each path is converted to ``str``, UTF-8 encoded and handed to
    ``cfuns.load_image`` as a C array of ``char*`` together with a pointer to
    the buffer's memory, the number of paths, ``IMAGE_ROW``, ``IMAGE_COL`` and
    the literal ``3``.
    NOTE(review): the last three arguments are presumably target height, width
    and channel count -- confirm against the C source.

    Args:
        image_paths: iterable of path-like objects, one per image.
        buff: array-like destination buffer (viewed through ``np.asarray``).

    Returns:
        Wall-clock seconds spent in this call.
    """
    started = time.time()

    # Keep the ndarray view in a local so the memory behind the raw pointer
    # stays referenced for the duration of the C call.
    pixels = np.asarray(buff)
    pixels_ptr = pixels.ctypes.data_as(ctypes.c_char_p)

    encoded = [ctypes.c_char_p(str(entry).encode('utf-8')) for entry in image_paths]
    count = len(encoded)
    c_paths = (ctypes.c_char_p * count)(*encoded)

    cfuns.load_image(c_paths, pixels_ptr, count, IMAGE_ROW, IMAGE_COL, 3)

    return time.time() - started
    
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))``.

    ``x`` may be a scalar or a NumPy array; arrays are handled element-wise
    by ``np.exp``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def yolo(out_buffer, batch_n, _, result):
    """Run the C post-processing step and append its rows to ``result``.

    A ``(batch_n, 4)`` int32 scratch array is allocated, ``cfuns.yolo`` is
    called with pointers to ``out_buffer`` and to that scratch array, and the
    scratch rows are then appended to ``result`` as plain Python lists.
    NOTE(review): the four integers per image are presumably a bounding box --
    confirm against the C source.

    Args:
        out_buffer: array exposing ``.ctypes`` that holds the accelerator output.
        batch_n: number of rows to produce.
        _: ignored; kept so existing call sites stay valid.
        result: list extended in place with ``batch_n`` 4-element lists.
    """
    boxes = np.empty((batch_n, 4), dtype=np.int32)
    cfuns.yolo(
        out_buffer.ctypes.data_as(ctypes.c_char_p),
        batch_n,
        boxes.ctypes.data_as(ctypes.c_char_p),
    )
    result.extend(boxes.tolist())
    
# Module-level state shared between successive net() calls.
# Index (0 or 1) into in_buffers / out_buffers that net() uses next.
which_buffer = 0
# True while net() is still waiting for the first batch of a run.
first_batch = True
# Number of accelerator launches issued by net() so far.
net_cnt = 0
# Row count net() passes to yolo() for the final batch; starts at BATCH_SIZE.
last_batch_size = BATCH_SIZE

def net(img_paths, result):
    """Advance the double-buffered inference pipeline by one step.

    Call once per batch in order, then once more with ``img_paths=None`` to
    flush. Each step (after the first) launches the accelerator on the batch
    loaded by the previous call. While the DMA transfer is in flight it loads
    the next batch into the other input buffer and post-processes the output
    of the batch launched one call earlier.

    Compared with the original implementation:
      * the size of the first batch is recorded, so a run consisting of a
        single partial batch no longer decodes ``BATCH_SIZE`` rows;
      * the module-level state is reset after the final flush, so the
        inference loop can be run again without restarting the kernel;
      * a flush call on an idle pipeline (no batch ever loaded) is a no-op
        instead of failing inside ``load_image``.

    Args:
        img_paths: sequence of image paths for the next batch, or ``None`` to
            signal that there are no more batches.
        result: list extended in place (via ``yolo``) with one entry per image.

    Returns:
        None.
    """
    global first_batch
    global which_buffer
    global net_cnt
    global last_batch_size

    if first_batch:
        if img_paths is None:
            # Nothing was ever loaded, so there is nothing to run or flush.
            return
        first_batch = False
        which_buffer = 0
        net_cnt = 0
        # Remember the size now: if this is also the only batch, the flush
        # call needs it to decode the right number of rows.
        last_batch_size = len(img_paths)
        load_image(img_paths, in_buffers[which_buffer])
        return

    # Count accelerator launches; the first launch has no earlier output yet.
    net_cnt += 1
    nn_ctrl.write(0x0, 0) # Reset
    # Write the leading dimension of the input buffer to offset 0x10.
    nn_ctrl.write(0x10, in_buffers[which_buffer].shape[0])
    nn_ctrl.write(0x0, 1) # Deassert reset

    dma.recvchannel.transfer(out_buffers[which_buffer])
    dma.sendchannel.transfer(in_buffers[which_buffer])

    # Switch to the other buffer pair while the transfer is running.
    which_buffer = 1 if which_buffer == 0 else 0

    if img_paths is not None:
        load_image(img_paths, in_buffers[which_buffer])

    # Post-process the output produced by the previous launch. The third
    # argument is ignored by yolo() as defined in this file.
    if net_cnt > 1:
        yolo(out_buffers[which_buffer], BATCH_SIZE, 127 * 15, result)

    if img_paths is not None:
        # Track the most recently loaded batch; only the last one may be short.
        last_batch_size = len(img_paths)

    dma.sendchannel.wait()
    dma.recvchannel.wait()

    if img_paths is None:
        # Final flush: decode the batch that was just run, which sits in the
        # buffer we switched away from.
        yolo(out_buffers[(which_buffer + 1) % 2], last_batch_size, 127 * 15, result)
        # Return to the initial state so a new run starts with a clean pipeline.
        first_batch = True
        net_cnt = 0
        last_batch_size = BATCH_SIZE

In [4]:
# Per-run measurements; one entry is appended to each list at the end of
# this cell.
time_list = []
fps_list = []
energy_list = []
num_rounds = 1  # not used anywhere in this cell

################################Inference##################################
interval_time = 0  # not used anywhere in this cell
total_time = 0
total_energy = 0
# Filled in place by net() with one entry per processed image.
result = list()
team.reset_batch_count()

rails = pynq.get_rails()

load_time = 0  # not used anywhere in this cell
start = time.time()
# Record the power of the "5V" rail while the batches are processed.
# NOTE(review): 0.05 is presumably the sampling interval in seconds -- confirm
# against the PYNQ DataRecorder documentation.
recorder = pynq.DataRecorder(rails["5V"].power)
with recorder.record(0.05):
    while True:
        image_paths = team.get_next_batch()
        # net() is also called with None: that final call flushes the
        # batches that are still in flight.
        net(image_paths, result)
        if image_paths is None:
            break

end = time.time()
t = end - start

# Energy measurements    
# Energy is estimated as mean recorded power multiplied by elapsed time.
energy = recorder.frame["5V_power"].mean() * t    


total_time = t
total_energy = energy
time_list.append(total_time)
energy_list.append(total_energy)
fps_list.append(len(result) / total_time)

print("Total time:", total_time, "seconds")
print("Total energy:", total_energy, "J")
print('images nums: {}'.format(len(result)))
print('fps: {}'.format(len(result) / total_time))
print()

Total time: 3.991365432739258 seconds
Total energy: 22.8304806855 J
images nums: 1000
fps: 250.54082790753242



In [5]:
# Hand the collected results plus the measured time and energy to the team
# helper, which writes them out.
team.save_results_xml(result, total_time, energy)
# Reset the Xlnk allocator.
# NOTE(review): per the PYNQ docs this resets all CMA buffers system-wide, so
# the in/out buffers allocated earlier should not be used afterwards -- confirm.
xlnk.xlnk_reset()