# CoDeNet SW
Software source code that invokes the CoDeNet accelerator and measures the inference latency at batch size 1.

In [1]:
from PIL import Image
import cffi
import os
import numpy as np
from pynq import Overlay, Xlnk
import pynq
import time 
import math
from scipy import signal
import logging

In [2]:
# Per-layer details are emitted through logging.debug(), so they stay hidden
# while the root logger is at INFO. Change the level to logging.DEBUG to see them.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

In [3]:
class CoDeNetAccel:
    """CoDeNet Accelerator Convolution Implementation.

    This class keeps all functions to invoke the FPGA accelerator.
    """

    def __init__(self, MAX_D, MAX_IC, MAX_OC, PA, PE, bitfile, MAX_BATCH=1):
        """One-off initialization to define the accelerator constraints.

        Loads the bitstream, resets the accelerator, zeroes the profiling
        counters and allocates the buffers that are shared with the FPGA.

        Args:
            MAX_D: maximum image width and height
            MAX_IC: maximum input channel size
            MAX_OC: maximum output channel size
            PA: the number of input channels to process in every iteration
            PE: the number of output channels to process in every iteration. The total MAC per iteration is PA x PE.
            bitfile: file name of the FPGA bitfile, resolved relative to the 'bitfile' directory
            MAX_BATCH: maximum batch size; only used to scale the feature-map buffer size
        """
        self.MAX_D = MAX_D
        self.MAX_IC = MAX_IC
        self.MAX_OC = MAX_OC
        self.PA = PA
        self.PE = PE
        ROOT_DIR = 'bitfile'

        # downloads the FPGA bitstream
        self.bitstream_name = os.path.join(ROOT_DIR, bitfile)
        self.overlay = Overlay(self.bitstream_name)
        # handle to the accelerator IP; all register reads/writes go through it
        self.accel = self.overlay.top_0
        self.overlay.download()

        # resets the accelerator: writes 0 to the control register at 0x00 and
        # polls until bit 0 reads back as 0
        self.accel.write(0x00, 0x00)
        while (self.accel.read(0x00) & 0x1):
            time.sleep(0.1)
        # profiling counters and per-call logs, accumulated by the methods of this class
        self.runtime = 0
        self.layercount = 0  # number of base_conv calls so far; selects the ping-pong buffers
        self.vectoraddtime = 0
        self.uptime = 0
        self.chansplit = 0
        self.convtime = 0
        self.collect = []
        self.vec_collect = []
        self.up_collect = []
        self.gops = 0
        self.peak_gops = 0
        self.shuffletime = 0
        self.batch_size = 0

        # initializes the size of the buffer
        inout_size = MAX_BATCH * MAX_IC * MAX_D * MAX_D // 2  # divided by 2 for 4-bit data
        weight_size_3x3dw = 9 * MAX_IC // 2
        addr_size = MAX_D * MAX_D
        weight_size_1x1 = MAX_IC * MAX_OC // 2
        quant_size = MAX_OC

        # allocates buffers for input args
        self.fmaps = []

        # three buffers for feature maps, used alternatively
        for _ in range(3):
            self.fmaps.append(self.get_pynq_buffer(shape=(inout_size,), dtype=np.uint8, cacheable=1))
        self.addr = self.get_pynq_buffer(shape=(addr_size,), dtype=np.uint8)
        self.weight_3x3dw = self.get_pynq_buffer(shape=(weight_size_3x3dw,), dtype=np.uint8)
        self.weight_1x1 = self.get_pynq_buffer(shape=(weight_size_1x1,), dtype=np.uint8)
        self.quant = self.get_pynq_buffer(shape=(quant_size,), dtype=np.int16, cacheable=1)

        # get the physical addresses of the buffers
        # (these are the values written to the accelerator's address registers)
        self.fmap_addrs = [fmap.physical_address for fmap in self.fmaps]
        self.addr_addr = self.addr.physical_address
        self.weight_addr_3x3dw = self.weight_3x3dw.physical_address
        self.weight_addr_1x1 = self.weight_1x1.physical_address
        self.quant_addr = self.quant.physical_address

        # randomly init the weights (every byte gets a value in [0, 16));
        # the quantization buffer is filled with ones
        self.weight_3x3dw[:] = np.random.randint(16, size=(weight_size_3x3dw,))
        self.weight_1x1[:] = np.random.randint(16, size=(weight_size_1x1,))
        self.quant[:] = np.ones((quant_size,))

    def get_stats(self):
        """ Calculate statistics of the runs."""
        self.totaltime = self.convtime + self.vectoraddtime + self.uptime + self.chansplit + self.shuffletime

    def print_stats(self):
        """ Print statistics of the runs."""
        print("\n\tCoDeNet Stats:")
        print("\t\tconv: %4f s" % (self.convtime))
        print("\t\tup2: %4f s:" % (self.uptime))
        print("\t\tchansplit: %4f s" % (self.chansplit))
        print("\t\tchanshuffle: %4f s" % (self.shuffletime))
        print("\t\tgops: %4f OP/s" % (self.gops / self.convtime))
        print("\t\tpeak_gops: %4f OP/s" % (self.peak_gops))
        print("\t\taccel_latency: %4f s" % (self.totaltime))
        print("\t\tframerate: %2f fps" % (1 / self.totaltime))

    def get_pynq_buffer(self, shape, dtype, cacheable=0):
        """Allocate a buffer through Xlnk's ``cma_array`` so the FPGA accelerator can access it."""
        allocator = Xlnk()
        buffer = allocator.cma_array(shape, dtype, cacheable=cacheable)
        return buffer

    def ceil(self, base, num):
        """Round ``num`` up to the nearest multiple of ``base``."""
        remainder = num % base
        # already aligned: hand the value back untouched
        return num if remainder == 0 else num + base - remainder

    def pack_array(self, input, radix: int):
        """Pack ``radix``-bit values into bytes, lowest field first.

        Every group of ``8 // radix`` consecutive elements is OR-ed into one
        integer, with element ``j`` of the group shifted left by ``radix * j``
        bits. Trailing elements that do not fill a whole group are dropped.
        """
        per_byte = 8 // radix
        n_bytes = len(input) // per_byte
        packed = []
        for first in range(0, n_bytes * per_byte, per_byte):
            byte = 0
            for slot in range(per_byte):
                byte = byte | (input[first + slot] << (radix * slot))
            packed.append(byte)
        return packed

    def unpack_array(self, input, radix):
        """Split every element into ``8 // radix`` fields of ``radix`` bits, lowest field first."""
        fields_per_byte = 8 // radix
        field_mask = (2 ** radix) - 1
        unpacked = []
        for byte in input:
            unpacked.extend((byte >> (radix * slot)) & field_mask for slot in range(fields_per_byte))
        return unpacked

    def conv(self, x, ic, oc, stride, skip3=0, skip1=0, deform=0, relu1=1, relu3=1):
        """Deformable convolution kernel.
            Args:
                x: input tensor in numpy.
                ic: input channel size.
                oc: output channel size.
                stride: stride.
                skip3: If true, skip 3x3 dw conv.
                skip1: If true, skip 1x1 conv.
                deform: If true, run with deformable offsets.
                relu1: If true, run relu after 1x1 conv.
                relu3: If true, run relu after 3x3 dw conv.
        """
        if oc > 512:
            y = self.conv512(x, ic, oc, stride, skip3=skip3, skip1=skip1, deform=deform, relu1=relu1, relu3=relu3)
        else:
            y = self.base_conv(x, ic, oc, stride, skip3=skip3, skip1=skip1, deform=deform, relu1=relu1, relu3=relu3)
        return y

    def conv512(self, x, ic, oc, stride, skip3=0, skip1=0, deform=0, relu1=1, relu3=1):
        """Deformable convolution kernel for oc > 512."""
        oc_remain = oc
        it = 0
        ys = []
        while (oc_remain > 0):
            if (oc_remain - 512) > 0:
                oc_size = 512
            else:
                oc_size = oc_remain

            if skip1 == 0:
                ic_size = ic
                x_in = x
            else:
                ic_size = oc_size
                if (oc_remain - 512) > 0:
                    x_in = x[:, :, :, 512 * it: 512 * (it + 1)]
                else:
                    x_in = x[:, :, :, 512 * it:]
            y = self.base_conv(x_in, ic_size, oc_size, stride, skip3=skip3, skip1=skip1, deform=deform, relu1=relu1,
                               relu3=relu3)
            ys.append(y)

            it += 1
            oc_remain = oc_remain - 512

        start = time.time()
        y = np.concatenate(ys, axis=3)
        end = time.time()
        dur = end - start
        self.convtime += dur
        return y

    def update_args(self, in_addr, out_addr, weight_addr_1x1, weight_addr_3x3dw, quant_addr, addr_addr, fm_d, ic_size,
                    oc_size, batch_size, with_stride, skip3, skip1, deform, relu1, relu3):
        """Update the accelerator args.
            These can be hardcoded in memory, we expose them for more flexibility.
        """
        self.accel.write(0x10, in_addr)
        self.accel.write(0x18, out_addr)
        self.accel.write(0x20, weight_addr_1x1)
        self.accel.write(0x28, weight_addr_3x3dw)
        self.accel.write(0x30, quant_addr)
        self.accel.write(0x38, addr_addr)
        self.accel.write(0x40, fm_d)
        self.accel.write(0x48, ic_size)
        self.accel.write(0x50, oc_size)
        self.accel.write(0x58, batch_size)
        self.accel.write(0x60, with_stride)
        self.accel.write(0x68, skip3)
        self.accel.write(0x70, skip1)
        self.accel.write(0x78, deform)
        self.accel.write(0x80, relu1)
        self.accel.write(0x88, relu3)

    def base_conv(self, x, ic, oc, stride, skip3=0, skip1=0, deform=0, relu1=1, relu3=1):
        """Deformable convolution kernel.
            Args:
                x: input tensor in numpy of shape (batch_size, height, width, input_channel_size).
                ic: input channel size.
                oc: output channel size.
                stride: stride.
                skip3: If true, skip 3x3 dw conv.
                skip1: If true, skip 1x1 conv.
                deform: If true, run with deformable offsets.
                relu1: If true, run relu after 1x1 conv. For quantization, with relu, we use symmetric quantization
                    with zero point at 0.
                relu3: If true, run relu after 3x3 dw conv.
        """
        if stride > 1 and skip3 == 1:
            raise ValueError("Cannot skip 3x3 dw conv if stride is larger than 1.")

        # batch_size size
        batch_size = x.shape[0]
        # input feature map dimension
        fm_d = x.shape[1]
        # input channel size, round to PA's multiples
        ic_size = self.ceil(self.PA, ic)
        # output channel size, round to PE's multiples
        oc_size = self.ceil(self.PE, oc)

        if ic_size != x.shape[3]:
            raise ValueError("x tensor input channel count must be equal to ")

        if stride == 2:
            y = np.zeros((batch_size, fm_d // 2, fm_d // 2, oc_size), dtype=np.int8)
        else:
            y = np.zeros((batch_size, fm_d, fm_d, oc_size), dtype=np.int8)

        with_stride = 1 if stride == 2 else 0

        in_addr = self.fmap_addrs[self.layercount % 2]
        out_addr = self.fmap_addrs[(self.layercount + 1) % 2]
        weight_addr_1x1 = self.weight_addr_1x1
        weight_addr_3x3dw = self.weight_addr_3x3dw
        quant_addr = self.quant_addr
        addr_addr = self.addr_addr

        # passes the args to the accelerator
        # this can be stored to memory before invoking the accelerator
        self.update_args(in_addr, out_addr, weight_addr_1x1, weight_addr_3x3dw, quant_addr, addr_addr, fm_d,
                         ic_size, oc_size, batch_size, with_stride, skip3, skip1, deform, relu1, relu3)

        # starts the accel
        begin = time.time()
        ctrl_val = self.accel.read(0x00)
        ready = not (ctrl_val & 0x1)
        if not ready:
            raise ("Accelerator not ready!")        
        self.accel.write(0x00, 0x1)

        # checks for the done signal
        while not (self.accel.read(0x0) & 0x2):
            pass

        end = time.time()
        dur = end - begin

        op3 = batch_size * (1 - skip3) * 9 * fm_d * fm_d * ic_size * 2 / 1000000000 / stride / stride
        op1 = batch_size * (1 - skip1) * fm_d * fm_d * oc_size * ic_size * 2 / 1000000000 / stride / stride
        ops = op1 + op3
        self.gops += ops
        ops = ops / dur

        if ops > self.peak_gops:
            self.peak_gops = ops

        logging.debug("CONV: fm_d=%d ic=%d oc=%d stride=%d skip3=%d skip1=%d deform=%d Time Elapsed=%2f GOPS=%2f." % (
            fm_d, ic, oc, stride, skip3, skip1, deform, dur, ops))
        self.convtime += dur
        self.layercount += 1
        self.collect.append([fm_d, ic, oc, stride, skip3, dur, ops])

        # set y's content to the output content
        y[:] = self.fmaps[(self.layercount + 1) % 2][:y.size].reshape(y.shape)
        return y

    def upsample2(self, x):
        """Upsamples the feature map by duplicating the pixel. The input size changes from
            (d x d x ic) to (2d x 2d x ic).
        """
        fmap_in = x
        begin = time.time()
        # repeats elements on dim h and w two times
        fmap_out = fmap_in.repeat(2, 1).repeat(2, 2)
        end = time.time()
        dur = end - begin
        self.uptime += dur
        self.up_collect.append([x.shape[1], x.shape[3], dur])
        logging.debug("UP2: fm_d=%d ic=%d Time Elapsed=%2f." % (x.shape[1], x.shape[3], dur))
        return fmap_out

    def channel_split(self, x):
        """Splits the input into two branch from the channel dimension. The input size
            changes from (d x d x ic) to (d x d x ic/2).
        """
        begin = time.time()
        # take every two element on the 3rd dim
        y1 = x[:, :, :, ::2]
        # take every two element starting from the 1st element
        y2 = x[:, :, :, 1::2]
        end = time.time()
        dur = end - begin
        self.chansplit += dur
        logging.debug("SPLIT: fm_d=%d ic%d oc=%d Time Elapsed=%2f." % (x.shape[1], x.shape[3], y1.shape[3], dur))
        return y1, y2

    def concat(self, x1, x2):
        """Concats two branches together from the channel dimension."""
        begin = time.time()
        y = np.concatenate((x1, x2), axis=3)
        end = time.time()
        dur = end - begin
        dur *= self.batch_size
        self.vectoraddtime += dur
        logging.debug("CONCAT: Time Elapsed=%2f." % (dur))
        return y

    def channel_shuffle(self, x, G=2):
        """Shuffles two branches from the channel dimension."""
        begin = time.time()
        B, H, W, C = x.shape
        x = x.reshape(B, H, W, G, C // G)
        x = np.transpose(x, (0, 1, 2, 4, 3))
        x = x.reshape(B, H, W, C)
        end = time.time()
        dur = end - begin
        self.shuffletime += dur
        logging.debug("SHUFFLE: Time Elapsed=%2f." % (dur))
        return x

    def concat_and_shuffle(self, x1, x2):
        """Concats and shuffles two branches at the channel dimension."""
        begin = time.time()
        out = np.zeros((x1.shape[0], x1.shape[1], x1.shape[2], x1.shape[3] + x2.shape[3]), np.uint8)
        # every two element's values are from branch 1
        out[:, :, :, ::2] = x1
        # every two element's values are from branch 2, starting from the 1st element
        out[:, :, :, 1::2] = x2
        end = time.time()
        dur = end - begin
        self.shuffletime += dur
        logging.debug("CONCAT&SHUFFLE: Time Elapsed=%2f." % (dur))
        return out

    def BaseNode(self, x, inp, oup, stride):
        """Runs basic building blocks.
            if stride is 1, then runs:
                concat_and_shuffle(1x1+3x3dw+1x1(split(in)), split(in))
            else runs:
                concat_and_shuffle(3x3dw+1x1(in), (1x1+3x3dw+1x1(in)))
        """
        oup_inc = oup // 2
        if stride == 1:
            y1, y2 = self.channel_split(x)
            y1 = self.conv(y1, oup_inc, oup_inc, stride=1, skip1=0, skip3=0, relu1=1, relu3=0, deform=0)
            y2 = self.conv(y2, oup_inc, oup_inc, stride=1, skip3=1, relu1=1, deform=0)
            y = self.concat_and_shuffle(y1, y2)
        elif stride == 2:
            # bn1:
            y = self.conv(x, inp, inp, skip1=1, stride=2, relu3=0, deform=0)
            y1 = self.conv(y, inp, oup_inc, skip3=1, stride=1, relu1=1, deform=0)
            # bn2:
            y = self.conv(x, inp, oup_inc, stride=2, relu3=0, relu1=1, deform=0)
            y2 = self.conv(y, oup_inc, oup_inc, stride=1, skip3=1, relu1=1, deform=0)
            y = self.concat_and_shuffle(y1, y2)
        return y

    def HeadConv(self, x, ic, oc):
        """Runs to generate the labels for class, center, or box."""
        x = self.conv(x, ic, oc, stride=1, relu1=1, relu3=0, deform=0)
        return x

    def deform_conv(self, x, ic, oc):
        """Runs deformable convolution."""
        # first generates the offset with the 1x1 conv
        scale = self.conv(x, ic, 1, stride=1, skip3=1, skip1=0, relu1=0, deform=0)

        # then runs a 3x3 dw conv with the offset
        x = self.conv(x, ic, ic, stride=1, skip3=0, skip1=1, relu3=0, deform=1)

        if ic != oc:
            # runs a 1x1 conv to change the channel size if needed
            conv_channel = self.conv(x, ic, oc, stride=1, skip3=1, relu1=0, deform=0)
            return conv_channel
        else:
            return x

    def CoDeNet(self, x, w2=False):
        """"""
        if w2:
            self.channels = [32, 256, 512, 1024, 2048]
            deconv_planes = [2048, 256, 128]
        else:
            self.channels = [32, 128, 256, 512, 1024]
            deconv_planes = [1024, 256, 128]
        stage_repeats = [3, 7, 3]
        num_filters = [256, 128, 128]

        heads = [80, 2, 2]

        for idx in range(len(stage_repeats)):
            x = self.BaseNode(x, self.channels[idx], self.channels[idx + 1], 2)
            for _ in range(stage_repeats[idx]):
                x = self.BaseNode(x, self.channels[idx], self.channels[idx + 1], 1)
        x = self.conv(x, self.channels[3], self.channels[4], stride=1, skip3=1, skip1=0, deform=0, relu1=1)

        for i in range(len(num_filters)):
            x = self.deform_conv(x, deconv_planes[i], num_filters[i])
            x = self.upsample2(x)

        ret = []
        for head in heads:
            out = self.HeadConv(x, 128, head)
            ret.append(ret)
        self.get_stats()
        return ret

## 1. Example Code
Code that invokes the accelerator on an input with batch size 1, height 64, width 64, and 32 input channels.

In [4]:
# Build the accelerator wrapper, run the whole network once on a random
# (batch, height, width, channels) input and print the timing statistics.
example_shape = (1, 64, 64, 32)
accel = CoDeNetAccel(
    MAX_D=128,
    MAX_IC=1024,
    MAX_OC=1024,
    PA=16,
    PE=16,
    bitfile="batch.bit",
)
x = np.random.randint(256, size=example_shape)
y = accel.CoDeNet(x, w2=False)
accel.print_stats()


	CoDeNet Stats:
		conv: 0.019765 s
		up2: 0.000563 s:
		chansplit: 0.000125 s
		chanshuffle: 0.004843 s
		gops: 28.991496 OP/s
		peak_gops: 77.498617 OP/s
		accel_latency: 0.025296 s
		framerate: 39.532357 fps


## 2. Experiment Configurations and Results
This part runs CoDeNet on the accelerator and reports the end-to-end latency: the time spent on the accelerator plus the time spent on the CPU for the first layer and the other auxiliary functions.
We have verified the correctness of the deformable conv kernel in Vivado HLS.

In [5]:
class ExpConfig:
    """One experiment configuration for the CoDeNet accelerator."""

    def __init__(self, name, in_size, w2, first_latency):
        """
        Store the configuration and add the layout-conversion overhead to the first-layer latency.
        The first layer is run on CPU due to 8-bit weights.
        We obtain the first layer latency using TVM (see ./sw/tvm/README.md).
            Args:
                name: config name corresponding to table 3&5 in the paper.
                in_size: input size to the accelerator pipeline, as (batch, height, width, channels).
                w2: whether to double the channel size in the network to improve accuracy.
                first_latency: first-layer latency obtained from the TVM runs, in milliseconds.
        """
        self.name, self.in_size = name, in_size
        self.w2, self.first_latency = w2, first_latency
        self.transpose_input()

    def transpose_input(self):
        """Time an nchw -> nhwc transpose of a random input and add it (in ms) to ``first_latency``."""
        # reorder the nhwc in_size into the nchw shape of the tensor to convert
        nchw_shape = tuple(self.in_size[axis] for axis in (0, 3, 1, 2))
        sample = np.random.randint(256, size=nchw_shape)

        # NOTE(review): ndarray.transpose returns a view, so the interval below
        # does not include any data movement - confirm that this is intended.
        begin = time.time()
        sample.transpose((0, 2, 3, 1))
        elapsed = time.time() - begin

        self.first_latency += elapsed * 1000

In [6]:
# The first layer is quantized to 8-bit weights, so it runs on the ARM CPU rather than on the accelerator.
# See ./sw/tvm/README.md for how to run the first conv+pooling layer on TVM.
experiment_table = (
    # (name, in_size, w2, first_latency in ms)
    ('config a', (1, 64, 64, 32), False, 1.8),
    ('config b', (1, 64, 64, 32), False, 7.9),
    ('config c', (1, 128, 128, 32), False, 5.0),
    ('config d', (1, 128, 128, 32), True, 5.0),
    ('config e', (1, 128, 128, 32), True, 30.4),
)
configs = [
    ExpConfig(name=name, in_size=in_size, w2=w2, first_latency=first_latency)
    for name, in_size, w2, first_latency in experiment_table
]

print("Run Experiements: ")
for config in configs:
    # a fresh accelerator wrapper per configuration keeps the statistics separate
    accel = CoDeNetAccel(MAX_D=128, MAX_IC=1024, MAX_OC=1024, PA=16, PE=16, bitfile="batch.bit")
    # random integer input of the configured size
    x = np.random.randint(256, size=config.in_size)
    y = accel.CoDeNet(x, w2=config.w2)
    accel.print_stats()
    # total = first layer on the CPU (ms) + accelerator latency (s -> ms)
    dur = config.first_latency + accel.totaltime * 1000
    print("\t%s takes %1f ms to finish."%(config.name, dur))

Run Experiements: 

	CoDeNet Stats:
		conv: 0.019726 s
		up2: 0.000563 s:
		chansplit: 0.000123 s
		chanshuffle: 0.004858 s
		gops: 29.048963 OP/s
		peak_gops: 78.014129 OP/s
		accel_latency: 0.025271 s
		framerate: 39.571519 fps
	config a takes 27.114093 ms to finish.

	CoDeNet Stats:
		conv: 0.019799 s
		up2: 0.000521 s:
		chansplit: 0.000124 s
		chanshuffle: 0.004867 s
		gops: 28.941224 OP/s
		peak_gops: 77.712583 OP/s
		accel_latency: 0.025311 s
		framerate: 39.508153 fps
	config b takes 33.241988 ms to finish.

	CoDeNet Stats:
		conv: 0.065977 s
		up2: 0.003505 s:
		chansplit: 0.000142 s
		chanshuffle: 0.020069 s
		gops: 34.740178 OP/s
		peak_gops: 94.772719 OP/s
		accel_latency: 0.089693 s
		framerate: 11.149163 fps
	config c takes 94.741707 ms to finish.

	CoDeNet Stats:
		conv: 0.124250 s
		up2: 0.002711 s:
		chansplit: 0.000179 s
		chanshuffle: 0.040452 s
		gops: 53.451984 OP/s
		peak_gops: 100.887088 OP/s
		accel_latency: 0.167592 s
		framerate: 5.966862 fps
	config d takes 1