In [1]:
import cffi
import os
import numpy as np
from pynq import Overlay, Xlnk
import pynq
import time 
import math

In [2]:
from scipy import signal

In [16]:
class Convlayer:
    def __init__(self, MAX_D, MAX_IC, MAX_OC, PA, PE, bitfile):
        """Load the accelerator bitstream and reset every profiling counter.

        Args:
            MAX_D, MAX_IC, MAX_OC: stored on the instance unchanged; this
                constructor does not otherwise use them.
            PA: multiple that ``conv0`` rounds the input-channel count up to.
            PE: multiple that ``conv0`` rounds the output-channel count up to.
            bitfile: bitstream file name, resolved inside the ``bitfile``
                directory relative to the current working directory.
        """
        # Configuration handed in by the caller.
        self.MAX_D, self.MAX_IC, self.MAX_OC = MAX_D, MAX_IC, MAX_OC
        self.PA, self.PE = PA, PE

        # Bring up the overlay; ``top_0`` is the IP block driven by conv0.
        self.bitstream_name = os.path.join('bitfile', bitfile)
        self.overlay = Overlay(self.bitstream_name)
        self.accel = self.overlay.top_0
        self.overlay.download()

        # Scalar accumulators (seconds, layer count, giga-operations).
        self.runtime = 0
        self.layercount = 0
        self.vectoraddtime = 0
        self.uptime = 0
        self.chansplit = 0
        self.convtime = 0

        # Per-call records appended by the individual operations.
        self.collect = []
        self.vec_collect = []
        self.up_collect = []

        self.gops = 0
        self.peak_gops = 0
        self.shuffle_time = 0
        self.batch = 0
    
    def showresult(self):
        total = self.convtime+self.vectoraddtime+self.uptime+self.chansplit+self.shuffle_time
        print("convtime:",self.convtime)
        print("vectoraddtime:",self.vectoraddtime)
        print("uptime:",self.uptime)
        print("chansplit:",self.chansplit)
        print("channel_shuffle:", self.shuffle_time)
        print("total:",total)
        print("gops:",self.gops/self.convtime)
        print("peak_gops",self.peak_gops)
        print("framerate: ", 1/total)
    
    def get_pynq_buffer(self, shape, dtype, cacheable=0):
        """Allocate an array through ``Xlnk().cma_array``.

        A fresh ``Xlnk`` handle is created on every call; ``shape``, ``dtype``
        and ``cacheable`` are passed straight through to ``cma_array``.
        """
        allocator = Xlnk()
        buffer = allocator.cma_array(shape, dtype, cacheable=cacheable)
        return buffer
    
    def ceil(self, base, num):
        """Round ``num`` up to the nearest multiple of ``base``.

        Values that are already a multiple of ``base`` are returned unchanged.
        """
        remainder = num % base
        if remainder == 0:
            return num
        return num + base - remainder
        
    def pack_array(self, input, radix):
        """Pack ``radix``-bit values into bytes, lowest field first.

        ``8 // radix`` consecutive elements of ``input`` are OR-ed into one
        output value, element ``j`` of a group being shifted left by
        ``radix * j`` bits. Elements are not masked, and trailing elements
        that do not fill a complete group are dropped.

        Returns:
            A list with ``len(input) // (8 // radix)`` packed values.
        """
        total = len(input)
        fields_per_byte = 8 // radix
        n_packed = total // fields_per_byte
        packed = []
        for group in range(n_packed):
            first = group * fields_per_byte
            word = 0
            for slot in range(fields_per_byte):
                word |= input[first + slot] << (radix * slot)
            packed.append(word)
        return packed

    def unpack_array(self, input, radix):
        """Unpack bytes into ``radix``-bit fields (inverse of ``pack_array``).

        Every element of ``input`` yields ``8 // radix`` values: field ``j``
        is the element shifted right by ``radix * j`` bits and masked to
        ``radix`` bits, so the lowest field comes first.

        Returns:
            A flat list with ``len(input) * (8 // radix)`` values.
        """
        count = len(input)
        fields_per_byte = 8 // radix
        field_mask = (2 ** radix) - 1
        return [
            (input[position] >> (radix * slot)) & field_mask
            for position in range(count)
            for slot in range(fields_per_byte)
        ]
    
#     def vec_add(self, FM_D, C):
#         a=np.random.randint(low=0,high=16,size=self.batch*FM_D*FM_D*C,dtype=np.uint8)
#         b=np.random.randint(low=0,high=16,size=self.batch*FM_D*FM_D*C,dtype=np.uint8)
#         begin = time.time()
#         c= np.add(a,b)
#         end = time.time()
#         dur = end-begin
#         print("VECTOR ADD Time Elapsed {}".format(dur))
#         self.vectoraddtime += dur
#         self.vec_collect.append([FM_D,C,dur])
#         return dur
    
    def conv(self, x, IC, OC, stride, skip3=0,skip1=0,deform=0,relu1=1,relu3=1):
        T_flag = 0
        if skip1==0:
            for i in range(4):
                if OC > 512:
                    oc = 512
                    OC -= 512
                    y1 = self.conv0(x, IC, oc, stride, skip3=0,skip1=0,deform=0,relu1=1,relu3=1)
                    if T_flag:
                        y = np.concatenate((y, y1), axis=3)
                    else:
                        y = y1
                    T_flag = 1
                else:
                    y2 = self.conv0(x, IC, OC, stride, skip3=0,skip1=0,deform=0,relu1=1,relu3=1)
                    if T_flag:
                        y = np.concatenate((y, y2), axis=3)
                    else:
                        y = y2
                    break
            return y
        
        else:
            for i in range(4):
                if OC > 512:
                    oc = 512
                    ic = 512
                    OC -= 512
                    IC -= 512
                    y1 = self.conv0(x[:,:,:,0:512], ic, oc, stride, skip3=0,skip1=0,deform=0,relu1=1,relu3=1)
                    if T_flag:
                        y = np.concatenate((y, y1), axis=3)
                    else:
                        y = y1
                    T_flag = 1
                else:
                    if T_flag == 1:
                        x = x[:,:,:,512:1024]
                        
                    y2 = self.conv0(x, IC, OC, stride, skip3=0,skip1=0,deform=0,relu1=1,relu3=1)
                    
                    if T_flag:
                        y = np.concatenate((y, y2), axis=3)
                    else:
                        y = y2
                    break

            return y
    
    def conv0(self, x, IC, OC, stride, skip3=0,skip1=0,deform=0,relu1=1,relu3=1):
        """Program the accelerator for one layer, run it and record the timing.

        Notes carried over from the original author:
          0. batch: currently batch > 1 is not supported
          1. skip3 = 1 skips the 3x3 convolution
          2. skip1 = 1 skips the 1x1 convolution
          3. stride = 1 or 2; if stride = 2 then skip3 should be 0
          4. deform = 1 when a deformable conv is needed
          5. relu = 1 uses ReLU activation; relu = 0 uses HardTanh activation
             (from quantization)

        Args:
            x: input array. ``x.shape[0]`` is used as the batch size,
                ``x.shape[1]`` as the feature-map size for both spatial
                dimensions, and ``x.shape[3]`` must equal ``IC`` rounded up
                to a multiple of ``self.PA`` (enforced with ``assert``).
            IC, OC: channel counts; rounded up to multiples of ``self.PA`` /
                ``self.PE`` before being written to the accelerator.
            stride: register 0x60 receives 1 when ``stride == 2``, else 0.
            skip3, skip1, deform, relu1, relu3: written unchanged to
                registers 0x68-0x88.

        Returns:
            A zero-filled ``int8`` array of the output shape. The contents of
            ``x`` are never copied into the input buffer and the output buffer
            is never read back, so only the timing is meaningful.

        Side effects:
            Prints two lines and updates ``self.gops``, ``self.peak_gops``,
            ``self.convtime``, ``self.layercount`` and ``self.collect``.
        """
        batch = x.shape[0]
        FM_D = x.shape[1]  
        # Channel counts padded to the PA / PE multiples.
        TOTAL_IC = self.ceil(self.PA, IC)
        TOTAL_OC = self.ceil(self.PE, OC)
        assert(TOTAL_IC == x.shape[3])  
        print(FM_D * TOTAL_OC)
        
        # Placeholder result: spatial size is halved for stride 2.
        if stride == 2:
            y = np.zeros((batch, FM_D//2, FM_D//2, TOTAL_OC),dtype=np.int8)
        else:
            y = np.zeros((batch, FM_D, FM_D, TOTAL_OC),dtype=np.int8)
        
        # Buffer sizes in bytes; the "// 2" packs two 4-bit values per byte.
        # out_size uses the full FM_D even when stride == 2.
        in_size = batch * TOTAL_IC * FM_D * FM_D // 2
        weight_size3 = 9 * TOTAL_IC  // 2
        addr_size =    FM_D*FM_D
        out_size = batch * TOTAL_OC *  FM_D * FM_D //2
        weight_size1_1 =  TOTAL_IC * TOTAL_OC // 2
        
        
        # Fresh buffers are allocated on every call and left uninitialised by
        # this method.
        # NOTE(review): `addr` presumably holds deformable-conv addressing
        # data and `quant` per-channel quantisation parameters - confirm
        # against the HLS source.
        in_fmap = self.get_pynq_buffer(shape=(in_size,), dtype=np.uint8,cacheable=1)        # divided by 2 -> 4 bit data
        addr = self.get_pynq_buffer(shape=(addr_size,), dtype=np.uint8)
        weight3 = self.get_pynq_buffer(shape=(weight_size3,), dtype=np.uint8)
        weight1_1 = self.get_pynq_buffer(shape=(weight_size1_1,), dtype=np.uint8)
        out_fmap = self.get_pynq_buffer(shape=(out_size,), dtype=np.uint8,cacheable=1) 
        quant = self.get_pynq_buffer(shape=(4 * TOTAL_OC,), dtype=np.int16,cacheable=1)
        
        
        # Physical addresses handed to the accelerator.
        in_fmap_addr  = in_fmap.physical_address
        addr_addr  = addr.physical_address
        weight_addr3  = weight3.physical_address
        weight_addr1_1  = weight1_1.physical_address
        out_fmap_addr  = out_fmap.physical_address
        quant_addr = quant.physical_address

        # Hardware encoding of the stride: 1 means stride 2, 0 otherwise.
        STRIDE = 1 if stride==2 else 0
        
        
        # Clear the control register, then treat bit 0 == 0 as "ready".
        # NOTE(review): offsets look like an HLS AXI-Lite register map
        # (bit 0 start, bit 1 done) - confirm against the generated driver.
        self.accel.write(0x00, 0x00)
        ctrl_val = self.accel.read(0x00)
        ready = not (ctrl_val & 0x1)
        
        # Arguments are only programmed when ready; the start write below
        # happens regardless.
        if(ready):
            #print(ready)
            self.accel.write(0x10, in_fmap_addr) 
            self.accel.write(0x18, out_fmap_addr) 
            self.accel.write(0x20, weight_addr1_1)
            self.accel.write(0x28, weight_addr3)
            self.accel.write(0x30, quant_addr)
            self.accel.write(0x38, addr_addr) 
            self.accel.write(0x40, FM_D) 
            self.accel.write(0x48, TOTAL_IC) 
            self.accel.write(0x50, TOTAL_OC) 
            self.accel.write(0x58, batch)
            self.accel.write(0x60, STRIDE)
            self.accel.write(0x68, skip3)
            self.accel.write(0x70, skip1)
            self.accel.write(0x78, deform)
            self.accel.write(0x80, relu1)
            self.accel.write(0x88, relu3)

        # Timed region: start the accelerator and busy-wait on bit 1 of the
        # control register.
        begin = time.time()
        self.accel.write(0x00, 0x1) # Start the accel 
        while not (self.accel.read(0x0) & 0x2):
            pass
        
        
        end = time.time()
        dur = end-begin

        
        # Work estimate in giga-operations, scaled down by stride squared.
        # op3 has no TOTAL_OC factor (one 3x3 filter per input channel);
        # op1 is the dense 1x1 stage. The factor 2 presumably counts
        # multiply and add separately - TODO confirm.
        op3 =batch * (1-skip3) * 9 * FM_D * FM_D  * TOTAL_IC * 2 / 1000000000 / stride / stride 
        op1 =batch * (1-skip1) * FM_D * FM_D  * TOTAL_OC * TOTAL_IC * 2 / 1000000000 / stride / stride
        ops = op1 + op3 
        self.gops += ops
        # From here on `ops` is throughput (giga-operations per second).
        ops=ops/dur
        
        if ops>self.peak_gops:
            self.peak_gops=ops
        
        print("conv: FM_D: %d\tIC: %d\t OC: %d\t stride:%d skip3:%d skip1:%d deform:%d Time Elapsed :%f GOPS:%f "%(FM_D,IC,OC,stride,skip3,skip1,deform,dur,ops))
        self.convtime+=dur
        self.layercount+=1
        self.collect.append([FM_D,IC,OC,stride,skip3,dur,ops])
        return y
    
    def upsample2(self,x):
        fmap_in = x
        begin = time.time()
        fmap_out = fmap_in.repeat(2,1).repeat(2,2)
        end = time.time()
        dur = end-begin
        self.uptime+=dur
        self.up_collect.append([x.shape[1],x.shape[3],dur])
        return fmap_out
        print("up2 FM_D:%d FM_C%d time:%f"%(x.shape[1],x.shape[3],dur))
    
    def channel_split(self,x):
        begin = time.time()
        y1 = x[:,:,:,::2]
        y2 = x[:,:,:,1::2]
        #print("out",fmap_out2)
        end = time.time()
        dur = end-begin
        self.chansplit+=dur
        print("channel_split FM_D:%d FM_C%d time:%f"%(x.shape[1],x.shape[2],dur))
        return y1, y2
        
    def concat(self,x1,x2):
        begin = time.time()
        y = np.concatenate((x1, x2), axis=3)
        end = time.time()
        dur = end-begin
        dur *= self.batch
        self.vectoraddtime += dur
        print("Time Elapsed %f"%(dur))
        return y
    
    def channel_shuffle(self, x, G=2):
        begin = time.time()
        B, H, W, C = x.shape
        x = x.reshape(B, H, W, G, C // G)
        x = np.transpose(x, (0, 1, 2, 4, 3))
        x = x.reshape(B, H, W, C)
        end = time.time()
        dur = end-begin
        self.shuffle_time += dur
        print("channel_shuffle: %f"%(dur))
        return x
    
    def concat_and_shuffle(self, x1, x2):
        begin = time.time()
        out = np.zeros((x1.shape[0], x1.shape[1],x1.shape[2], x1.shape[3] + x2.shape[3]),np.uint8)
        out[:,:,:,::2] = x1
        out[:,:,:,1::2] = x2
        end = time.time()
        dur = end-begin
        self.shuffle_time += dur
        print("concat and shuffle: %f"%(dur))
        return out
 
    #https://github.com/Zhen-Dong/EfficientDeformable/blob/998de80e196a9578a0ec5cb1d54a37b502bfd695/lib/models/external/modules/dcn_deform_conv.py#L349
    
    def BaseNode(self, x, inp, oup, stride):
        oup_inc = oup // 2
        if stride == 1:
            y1, y2 = self.channel_split(x)
            print(y1.shape, y2.shape)
            y2 = self.conv(y2, oup_inc, oup_inc, stride=1, skip1=0, skip3=0, relu1=1, relu3=0, deform=0)
            y2 = self.conv(y2, oup_inc, oup_inc, stride=1, skip3=1, relu1=1, deform=0)
            y = self.concat_and_shuffle(y1, y2)
        elif stride == 2:
            #bn1:
            y = self.conv(x, inp, inp, skip1=1, stride=2, relu3=0, deform=0)
            y1 = self.conv(y, inp, oup_inc, skip3=1, stride=1, relu1=1, deform=0)
            #bn2:
            y = self.conv(x, inp, oup_inc, stride=2, relu3=0, relu1=1, deform=0)
            y2 = self.conv(y, oup_inc, oup_inc, stride=1, skip3=1, relu1=1, deform=0)
            y = self.concat_and_shuffle(y1, y2)
#             y = self.concat(y1,y2)
#             y = self.channel_shuffle(y)
        return y
    

    
    def HeadConv(self,x,inp, oup, head_conv=64):
        x = self.conv(x, 128, oup, stride=1, relu1=1,relu3=0, deform=0)
        #x = self.conv(x, head_conv, oup, stride=1, skip3=1, relu1=1, deform=0) 
        return x
    
    
    def deform_conv(self, x, ic, oc):
        scale = self.conv(x, ic, 1,stride=1, skip3=1, skip1=0, relu1=0, deform=0)
        
        #conv_deform:
        x = self.conv(x, ic, ic, stride=1, skip3=0, skip1=1, relu3=0, deform=1)
        
        if ic != oc:
            conv_channel = self.conv(x, ic, oc, stride=1, skip3=1, relu1=0, deform=0)
            return conv_channel
        else:
            return x

                 
    
    
    def PoseSufflenetv2(self, x, w2=False):
        #self.conv(512,3,24,stride=2,deform=0)
        #self.layer0 = nn.Sequential(nn.Conv2d(3, self.channels[0], 3, 4, 1, bias=False)
        #self.channels = [64, 128, 256, 512]
        if w2:
            self.channels = [32, 256, 512, 1024, 2048]
            deconv_planes = [2048, 256, 128]
        else:
            self.channels = [32, 128, 256, 512, 1024]
            deconv_planes = [1024, 256, 128]
        stage_repeats = [3, 7, 3]
        num_filters = [256, 128, 128]
        
        heads = [80, 2, 2]
        
        for idx in range(len(stage_repeats)):
            x = self.BaseNode(x,self.channels[idx], self.channels[idx+1], 2)
            for _ in range(stage_repeats[idx]):
                x = self.BaseNode(x,self.channels[idx], self.channels[idx+1],1)
        x = self.conv(x, self.channels[3], self.channels[4], stride=1,skip3=1,skip1=0,deform=0,relu1=1)
        
        for i in range(len(num_filters)):
            x = self.deform_conv(x, deconv_planes[i], num_filters[i])
            x = self.upsample2(x)
            #???
        ret = []
        for head in heads:
            out = self.HeadConv(x, 128, head)
            ret.append(ret)
        return ret
    



        



In [17]:
# Alternative bitstream, kept for reference:
#convlayer = Convlayer(MAX_D=512, MAX_IC=1024, MAX_OC=1024, PA=16, PE=16, bitfile="compare_13.bit")
# Create the accelerator wrapper. The constructor loads bitfile/batch.bit
# (path relative to the working directory) onto the device; PA=16 / PE=16 are
# the multiples that channel counts are rounded up to.
convlayer = Convlayer(MAX_D=512, MAX_IC=1024, MAX_OC=1024, PA=16, PE=16, bitfile="batch.bit")

In [18]:
# All-zero dummy input: batch 1, 64x64 feature map, 32 channels on the last axis.
x = np.zeros((1,64,64,32),dtype=np.uint8)
# Run the whole network once; w2=True selects the wider channel configuration.
y = convlayer.PoseSufflenetv2(x, w2=True)
# Print the accumulated timing / throughput summary.
convlayer.showresult()

2048
conv: FM_D: 64	IC: 32	 OC: 32	 stride:2 skip3:0 skip1:0 deform:0 Time Elapsed :0.001028 GOPS:2.614848 
4096
conv: FM_D: 32	IC: 32	 OC: 128	 stride:1 skip3:0 skip1:0 deform:0 Time Elapsed :0.000609 GOPS:14.733284 
8192
conv: FM_D: 64	IC: 32	 OC: 128	 stride:2 skip3:0 skip1:0 deform:0 Time Elapsed :0.001712 GOPS:5.245615 
4096
conv: FM_D: 32	IC: 128	 OC: 128	 stride:1 skip3:0 skip1:0 deform:0 Time Elapsed :0.000740 GOPS:48.528703 
concat and shuffle: 0.001345
channel_split FM_D:32 FM_C32 time:0.000013
(1, 32, 32, 128) (1, 32, 32, 128)
4096
conv: FM_D: 32	IC: 128	 OC: 128	 stride:1 skip3:0 skip1:0 deform:0 Time Elapsed :0.000738 GOPS:48.638390 
4096
conv: FM_D: 32	IC: 128	 OC: 128	 stride:1 skip3:0 skip1:0 deform:0 Time Elapsed :0.000740 GOPS:48.513073 
concat and shuffle: 0.001287
channel_split FM_D:32 FM_C32 time:0.000014
(1, 32, 32, 128) (1, 32, 32, 128)
4096
conv: FM_D: 32	IC: 128	 OC: 128	 stride:1 skip3:0 skip1:0 deform:0 Time Elapsed :0.000739 GOPS:48.622690 
4096
conv: FM_D: 

In [None]:
# x = np.zeros((128,128,256),dtype=np.uint8)
# y = convlayer.conv(x, 256, 256, stride=1, batch=1,skip3=0,skip1=0,deform=0,relu1=1,relu3=1)
# x = np.zeros((128,128,512),dtype=np.uint8)
# y = convlayer.conv(x, 512, 512, stride=1, batch=1,skip3=0,skip1=0,deform=0,relu1=1,relu3=1)

convtime: 0.16585612297058105
vectoraddtime: 0.0
uptime: 0.0032160282135009766
chansplit: 0.0003142356872558594
channel_shuffle: 0.004062652587890625
total: 0.17344903945922852
gops: 17.489583019582128
peak_gops 43.1627336339898
framerate:  5.765382172871953


In [None]:
# convlayer.collect
# collect = np.array(convlayer.collect)
# np.savetxt("collect.csv", collect,fmt='%i,%i,%i,%i,%i,%1.4e,%1.4e',)
# np.savetxt("up.csv",np.array(convlayer.up_collect),fmt='%i,%i,%1.4e')
# np.savetxt("vecadd.csv",np.array(convlayer.vec_collect),fmt='%i,%i,%1.4e')

In [None]:
# a=[1,2,3,4,5,6,7,8]
# convlayer.unpack_array(convlayer.pack_array(a,4),4)