In [None]:
from pynq import Overlay
from pynq import allocate

In [1]:
import numpy as np
from time import time
import matplotlib.pyplot as plt 

In [None]:
# validate data
def Validate(ourOutput, golden, size):
    """
    Compare a flattened 5-D result against the golden reference, element by element.

    Every mismatching element is reported on stdout with its 1-based
    [n][c][d][h][w] position, both values and the relative error in percent.

    ourOutput: flat, indexable result buffer; element (n, c, d, h, w) is read
               from position n*C*D*H*W + c*D*H*W + d*H*W + h*W + w
    golden:    flat, indexable reference buffer with the same layout
    size:      the five dimensions N, C, D, H, W (size[0] .. size[4])

    Returns the number of mismatching elements.
    """
    errors = 0
    N, C, D, H, W = size[0], size[1], size[2], size[3], size[4]
    # np.ndindex walks the indices in row-major order, so the running counter
    # equals n*C*D*H*W + c*D*H*W + d*H*W + h*W + w.
    for pos, (n, c, d, h, w) in enumerate(np.ndindex(N, C, D, H, W)):
        if ourOutput[pos] != golden[pos]:
            # Work on Python floats: subtracting unsigned NumPy scalars wraps
            # around (uint8: 3 - 5 -> 254) and would report a bogus error.
            ours, gold = float(ourOutput[pos]), float(golden[pos])
            if gold != 0:
                rel_err = 100 * (ours - gold) / gold
            else:
                # The relative error is unbounded when the reference is zero.
                rel_err = float('inf') if ours > gold else float('-inf')
            print(f'[ERROR]  result[{n+1:3.0f}][{c+1:3.0f}][{d+1:3.0f}][{h+1:3.0f}][{w+1:3.0f}] = {ours:3.0f}, gold: {gold:3.0f}, error: {rel_err:3.5f}%')
            errors += 1
    return errors

def Validate_file(ourOutput, golden, size, fp):
    """
    Compare a flattened 5-D result against the golden reference and log mismatches to a file.

    Every mismatching element is written to "fp" with its 1-based
    [n][c][d][h][w] position, both values and the relative error in percent.

    ourOutput: flat, indexable result buffer; element (n, c, d, h, w) is read
               from position n*C*D*H*W + c*D*H*W + d*H*W + h*W + w
    golden:    flat, indexable reference buffer with the same layout
    size:      the five dimensions N, C, D, H, W (size[0] .. size[4])
    fp:        writable text stream that receives the error report

    Returns the number of mismatching elements.
    """
    errors = 0
    N, C, D, H, W = size[0], size[1], size[2], size[3], size[4]
    # np.ndindex walks the indices in row-major order, so the running counter
    # equals n*C*D*H*W + c*D*H*W + d*H*W + h*W + w.
    for pos, (n, c, d, h, w) in enumerate(np.ndindex(N, C, D, H, W)):
        if ourOutput[pos] != golden[pos]:
            # Work on Python floats: subtracting unsigned NumPy scalars wraps
            # around (uint8: 3 - 5 -> 254) and would report a bogus error.
            ours, gold = float(ourOutput[pos]), float(golden[pos])
            if gold != 0:
                rel_err = 100 * (ours - gold) / gold
            else:
                # The relative error is unbounded when the reference is zero.
                rel_err = float('inf') if ours > gold else float('-inf')
            print(f'[ERROR]  result[{n+1:3.0f}][{c+1:3.0f}][{d+1:3.0f}][{h+1:3.0f}][{w+1:3.0f}] = {ours:3.0f}, gold: {gold:3.0f}, error: {rel_err:3.5f}%', file=fp)
            errors += 1
    return errors


In [None]:
def CountArr(filename):
    """
    Return the number of lines in the file "filename"
    (each line is taken to hold one data value)
    """
    with open(filename) as handle:
        # Iterating a text file yields one item per line, including a final
        # line that has no trailing newline.
        return sum(1 for _ in handle)

def LoadArr(filename, type):
    """
    Load the data array from the file "filename" into a PL numpy array of type "type"

    The file is expected to hold one numeric value per line. Each value is
    parsed as np.float32 and then converted with "type". Blank lines (for
    example a trailing empty line) are ignored instead of being counted as
    data and crashing the float conversion.

    Returns the buffer created by allocate(), with one element per non-blank line.
    """
    with open(filename) as f:
        # First pass: size the buffer from the lines that actually carry data.
        num = sum(1 for line in f if line.strip())
        arr = allocate(shape=(num,), dtype=type)
        # Second pass: stream the values in, so no intermediate list is built.
        f.seek(0)
        data_lines = (line for line in f if line.strip())
        for idx, line in enumerate(data_lines):
            arr[idx] = type(np.float32(line))
    return arr



In [None]:
# allocate buffers
# Result buffer: 10 uint8 values, later compared against the golden output.
output = allocate(shape=(10,), dtype=np.uint8)

# Working buffers (uint8) whose device addresses are handed to the IP.
# Sizes are element counts.
X_stem_1 = allocate(shape=(2_257_920,), dtype=np.uint8)
X_stem_2 = allocate(shape=(3_211_264,), dtype=np.uint8)
X_seq = allocate(shape=(50_176,), dtype=np.uint8)
X_adap = allocate(shape=(512,), dtype=np.uint8)
X_tmp_data = allocate(shape=(3_211_264,), dtype=np.uint8)
X2_data = allocate(shape=(802_816,), dtype=np.uint8)
X2_tmp_data = allocate(shape=(802_816,), dtype=np.uint8)
X_mid_data = allocate(shape=(7_225_344,), dtype=np.uint8)
X_batch_data = allocate(shape=(7_225_344,), dtype=np.uint8)

In [None]:
# allocate data

def _load_kernel(layer):
    """
    Load the int8 weights stored in "<layer>.weight.dat" into a PL numpy array
    """
    return LoadArr(layer + '.weight.dat', np.int8)


# stem
Kernel_stem_0 = _load_kernel('stem.0')
Kernel_stem_3 = _load_kernel('stem.3')

# layer1
Kernel_seq1_0_conv1_0_0 = _load_kernel('layer1.0.conv1.0.0')
Kernel_seq1_0_conv2_0_0 = _load_kernel('layer1.0.conv2.0.0')
Kernel_seq1_1_conv1_0_0 = _load_kernel('layer1.1.conv1.0.0')
Kernel_seq1_1_conv2_0_0 = _load_kernel('layer1.1.conv2.0.0')
Kernel_seq1_0_conv1_0_3 = _load_kernel('layer1.0.conv1.0.3')
Kernel_seq1_0_conv2_0_3 = _load_kernel('layer1.0.conv2.0.3')
Kernel_seq1_1_conv1_0_3 = _load_kernel('layer1.1.conv1.0.3')
Kernel_seq1_1_conv2_0_3 = _load_kernel('layer1.1.conv2.0.3')

# layer2
Kernel_seq2_0_conv1_0_0 = _load_kernel('layer2.0.conv1.0.0')
Kernel_seq2_0_conv1_0_3 = _load_kernel('layer2.0.conv1.0.3')
Kernel_seq2_0_conv2_0_0 = _load_kernel('layer2.0.conv2.0.0')
Kernel_seq2_0_conv2_0_3 = _load_kernel('layer2.0.conv2.0.3')
Kernel_seq2_0_downsample_0 = _load_kernel('layer2.0.downsample.0')
Kernel_seq2_1_conv1_0_0 = _load_kernel('layer2.1.conv1.0.0')
Kernel_seq2_1_conv1_0_3 = _load_kernel('layer2.1.conv1.0.3')
Kernel_seq2_1_conv2_0_0 = _load_kernel('layer2.1.conv2.0.0')
Kernel_seq2_1_conv2_0_3 = _load_kernel('layer2.1.conv2.0.3')

# layer3
Kernel_seq3_0_conv1_0_0 = _load_kernel('layer3.0.conv1.0.0')
Kernel_seq3_0_conv1_0_3 = _load_kernel('layer3.0.conv1.0.3')
Kernel_seq3_0_conv2_0_0 = _load_kernel('layer3.0.conv2.0.0')
Kernel_seq3_0_conv2_0_3 = _load_kernel('layer3.0.conv2.0.3')
Kernel_seq3_0_downsample_0 = _load_kernel('layer3.0.downsample.0')
Kernel_seq3_1_conv1_0_0 = _load_kernel('layer3.1.conv1.0.0')
Kernel_seq3_1_conv1_0_3 = _load_kernel('layer3.1.conv1.0.3')
Kernel_seq3_1_conv2_0_0 = _load_kernel('layer3.1.conv2.0.0')
Kernel_seq3_1_conv2_0_3 = _load_kernel('layer3.1.conv2.0.3')

# layer4
Kernel_seq4_0_conv1_0_0 = _load_kernel('layer4.0.conv1.0.0')
Kernel_seq4_0_conv1_0_3 = _load_kernel('layer4.0.conv1.0.3')
Kernel_seq4_0_conv2_0_0 = _load_kernel('layer4.0.conv2.0.0')
Kernel_seq4_0_conv2_0_3 = _load_kernel('layer4.0.conv2.0.3')
Kernel_seq4_0_downsample_0 = _load_kernel('layer4.0.downsample.0')
Kernel_seq4_1_conv1_0_0 = _load_kernel('layer4.1.conv1.0.0')
Kernel_seq4_1_conv1_0_3 = _load_kernel('layer4.1.conv1.0.3')
Kernel_seq4_1_conv2_0_0 = _load_kernel('layer4.1.conv2.0.0')
Kernel_seq4_1_conv2_0_3 = _load_kernel('layer4.1.conv2.0.3')

# fully connected layer
Kernel_linear = _load_kernel('fc.1')

In [None]:
# load input and output data
# Both files are read into PL arrays of uint8; output_golden is the reference
# the hardware result is checked against.
my_input = LoadArr(filename='input.dat', type=np.uint8)
output_golden = LoadArr(filename='output.dat', type=np.uint8)

In [None]:
# run the model
# load ip
ol = Overlay("R2plus1d_v1.bit")
ip_r2plus1d = ol.r2plus1d_0

# (register offset, buffer) pairs: the device address of each buffer is written
# to the IP register at the given offset. Offsets start at 0x10 and advance by
# 0xC per entry; the writes are issued in the order listed.
address_map = [
    # input address
    (0x10, my_input),
    # output address
    (0x1C, output),
    # kernel addresses
    (0x28, Kernel_stem_0),
    (0x34, Kernel_stem_3),
    (0x40, Kernel_seq1_0_conv1_0_0),
    (0x4C, Kernel_seq1_0_conv1_0_3),
    (0x58, Kernel_seq1_0_conv2_0_0),
    (0x64, Kernel_seq1_0_conv2_0_3),
    (0x70, Kernel_seq1_1_conv1_0_0),
    (0x7C, Kernel_seq1_1_conv1_0_3),
    (0x88, Kernel_seq1_1_conv2_0_0),
    (0x94, Kernel_seq1_1_conv2_0_3),
    (0xA0, Kernel_seq2_0_conv1_0_0),
    (0xAC, Kernel_seq2_0_conv1_0_3),
    (0xB8, Kernel_seq2_0_conv2_0_0),
    (0xC4, Kernel_seq2_0_conv2_0_3),
    (0xD0, Kernel_seq2_0_downsample_0),
    (0xDC, Kernel_seq2_1_conv1_0_0),
    (0xE8, Kernel_seq2_1_conv1_0_3),
    (0xF4, Kernel_seq2_1_conv2_0_0),
    (0x100, Kernel_seq2_1_conv2_0_3),
    (0x10C, Kernel_seq3_0_conv1_0_0),
    (0x118, Kernel_seq3_0_conv1_0_3),
    (0x124, Kernel_seq3_0_conv2_0_0),
    (0x130, Kernel_seq3_0_conv2_0_3),
    (0x13C, Kernel_seq3_0_downsample_0),
    (0x148, Kernel_seq3_1_conv1_0_0),
    (0x154, Kernel_seq3_1_conv1_0_3),
    (0x160, Kernel_seq3_1_conv2_0_0),
    (0x16C, Kernel_seq3_1_conv2_0_3),
    (0x178, Kernel_seq4_0_conv1_0_0),
    (0x184, Kernel_seq4_0_conv1_0_3),
    (0x190, Kernel_seq4_0_conv2_0_0),
    (0x19C, Kernel_seq4_0_conv2_0_3),
    (0x1A8, Kernel_seq4_0_downsample_0),
    (0x1B4, Kernel_seq4_1_conv1_0_0),
    (0x1C0, Kernel_seq4_1_conv1_0_3),
    (0x1CC, Kernel_seq4_1_conv2_0_0),
    (0x1D8, Kernel_seq4_1_conv2_0_3),
    (0x1E4, Kernel_linear),
    # working buffer addresses
    (0x1F0, X_stem_1),
    (0x1FC, X_stem_2),
    (0x208, X_seq),
    (0x214, X_adap),
    (0x220, X_tmp_data),
    (0x22C, X2_data),
    (0x238, X2_tmp_data),
    (0x244, X_mid_data),
    (0x250, X_batch_data),
]

for reg_offset, buf in address_map:
    ip_r2plus1d.write(reg_offset, buf.device_address)


In [None]:
# start computation
# start the computation for hls hardware: write 0x01 to register 0x00
# NOTE(review): 0x00 is presumably the HLS control register (bit 0 = start,
# bit 2 = idle) - confirm against the generated driver header.
timeKernelStart = time()
ip_r2plus1d.write(0x00, 0x01)
# wait for the computation to finish: poll register 0x00 until bit 2 (mask 0x4) is set
while not (ip_r2plus1d.read(0x00) & 0x4):
    pass
timeKernelEnd = time()
print(f"hardware execution time: {timeKernelEnd - timeKernelStart} s")

In [None]:
# validate the result
# The 10 output values are checked as an N, C, D, H, W = 1, 10, 1, 1, 1 tensor.
errors = Validate(output, output_golden, [1, 10, 1, 1, 1])
if not errors:
    print("[PASS] Congratulation! All results are correct")
else:
    # error rate = mismatches / number of output values (10)
    print("[FAIL] There are some errors QQ, error rate: ", errors / 10)