# Heron Core Testbench for Ultra96

This notebook provides a driver for a (BlockRAM-based) Heron core on the Ultra96 board. We'll first define the driver, then use it to run a sweep over the entire testbench for different GC threshold parameters.

In [None]:
from pynq import Overlay
from pynq import allocate
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import sys

Load binary versions for each benchmark and their expected results. These are generated by the `dumpTestsuites` function in the `heron-clash` module. These examples are for an architecture with `-r6:4:2:1:2:16` parameters for various heap depths.

In [None]:
import benchmarks.larges_r6_4_2_1_2_16_32k as benches_32k
import benchmarks.larges_r6_4_2_1_2_16_16k as benches_16k
import benchmarks.larges_r6_4_2_1_2_16_12k as benches_12k
import benchmarks.larges_r6_4_2_1_2_16_8k  as benches_8k

## Driver definitions

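Next, we define a helper function `packOps` to package a binary program into 64-bit words, ready for DMA transfer to Heron. Each instruction template is packed by `packOp` into a 512-bit line (eight 64-bit words) together with its address and the write-enable/go control bits.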

In [None]:
def mask(bits, run, offset):
    """Keep the low ``run`` bits of ``bits`` and shift them left by ``offset``."""
    keep = (1 << run) - 1
    return (bits & keep) << offset


def packOp(data, addr, we, go):
    """Pack one memory operation into eight 64-bit words (one 512-bit line).

    Bit layout of the line, least-significant bit first:
      * bit 0       -- ``go``
      * bit 1       -- ``we``
      * bits 2..11  -- ``addr`` (10 bits)
      * bits 12..348 -- ``data`` (337 bits)

    Each field is truncated to its width before being placed.  The words
    are returned least-significant word first.
    """
    # (value, width in bits, bit offset) for every field of the line
    layout = (
        (go, 1, 0),
        (we, 1, 1),
        (addr, 10, 2),
        (data, 337, 12),
    )
    line = 0
    for value, width, position in layout:
        line |= mask(value, width, position)

    # Slice the 512-bit line into 64-bit words, lowest word first
    low64 = (1 << 64) - 1
    return [(line >> shift) & low64 for shift in range(0, 512, 64)]

def packOps (tmpls):
    """Flatten a program into the stream of 64-bit words sent to Heron.

    Every template in ``tmpls`` becomes a write (``we=1``, ``go=0``) to the
    address equal to its position in the sequence.  Two trailing
    operations on address 1023 with no data follow: one with ``go=1`` and
    then one with ``go=0``.

    Returns a flat list containing the eight words of each operation, in
    order.
    """
    stream = []
    for address, template in enumerate(tmpls):
        stream.extend(packOp(template, address, 1, 0))

    # Raise the go bit for one operation, then drop it again
    stream.extend(packOp(0, 1023, 0, 1))
    stream.extend(packOp(0, 1023, 0, 0))
    return stream

We'll now define a small driver responsible for loading the Heron core onto the FPGA, transferring program binaries to it, and reading back the result.

In [None]:
from pynq import Clocks
from time import sleep

class HeronOverlay(Overlay):
    """Overlay subclass that drives a Heron core through two DMA channels.

    The instance owns a transmit buffer (packed program words) and a
    four-word receive buffer (result plus debug counters).
    """

    def __init__(self, bitfile, heap_size, **kwargs):
        """Load ``bitfile`` and allocate the DMA buffers.

        Args:
            bitfile: Bitstream path, forwarded to ``Overlay``.
            heap_size: Heap depth of the loaded core.  Used to scale the GC
                threshold and to select the bit offset of the return value.
            **kwargs: Forwarded unchanged to ``Overlay``.
        """
        super().__init__(bitfile, **kwargs)
        self.txBuf = allocate(shape=(8196+2,), dtype=np.uint64)
        self.rxBuf = allocate(shape=(4,), dtype=np.uint64)
        self.heap_size = heap_size

    def set_gc_threshold(self, x):
        """Drive the ``gc_threshold`` GPIO with ``x`` percent of the maximum.

        Args:
            x: Threshold as a percentage.
        """
        # Scale the percentage threshold by the maximum
        # (Heap depth / 4)
        n = int(self.heap_size*x/100/4)
        # Write the value one bit at a time onto the 32 GPIO lines
        for i in range(32):
            if (n >> i) & 1:
                self.gc_threshold.channel1[i].on()
            else:
                self.gc_threshold.channel1[i].off()

    def run(self, golden, code):
        """Send a program to Heron, wait for it to finish and check the result.

        Args:
            golden: Expected return value of the program.
            code: Sequence of instruction templates, as accepted by
                ``packOps``.

        Returns:
            A dict with keys ``ret``, ``roots``, ``muts``, ``waits`` and
            ``maxStall`` when the returned value equals ``golden``.

        Raises:
            ValueError: The packed program does not fit in the transmit
                buffer.
            RuntimeError: The core reported heap exhaustion, or returned a
                value different from ``golden``.
        """
        # Generate tx buffers from the given template binaries
        txWords = packOps(code)
        if len(txWords) > self.txBuf.size:
            raise ValueError(
                f'Program needs {len(txWords)} words but the transmit '
                f'buffer only holds {self.txBuf.size}')
        for i, word in enumerate(txWords):
            self.txBuf[i] = word
        # Clear whatever a previous, longer program left behind.  This uses
        # this instance's own buffer rather than a notebook-level global.
        self.txBuf[len(txWords):] = 0

        # Send buffer to Heron (arm the receive side first)
        self.dma_rx.recvchannel.transfer(self.rxBuf)
        self.dma_tx.sendchannel.transfer(self.txBuf)

        # Wait for completion
        self.dma_rx.recvchannel.wait()
        self.dma_tx.sendchannel.wait()

        # Parse return value and debug stats.  The four received words are
        # joined into one integer, word 0 being the least significant.
        shiftRet = 0 if self.heap_size < 32*1024 else 1
        atomLen  = 19
        ans = int(self.rxBuf[3]) << (128+64) | int(self.rxBuf[2]) << 128 | int(self.rxBuf[1]) << 64 | int(self.rxBuf[0])
        ret = (ans >> shiftRet) & 0b111111111111111
        # Four 32-bit counters follow the low ``atomLen`` bits
        maxStall = (ans >> atomLen) & 0xFFFFFFFF
        waits = (ans >> (atomLen+  32)) & 0xFFFFFFFF
        roots = (ans >> (atomLen+2*32)) & 0xFFFFFFFF
        muts  = (ans >> (atomLen+3*32)) & 0xFFFFFFFF
        stats = {'ret' : ret, 'roots': roots, 'muts': muts, 'waits': waits, 'maxStall': maxStall}
        total = waits+roots+muts

        def percent(cycles):
            # All counters can be zero; report 0% instead of dividing by zero
            return cycles/total*100 if total else 0.0

        # Report
        if ret == golden:
            print('Success!')
            print(f'{hex(ans)}')
            print(f'GC Overhead {percent(waits+roots)}%')
            print(f'   -> Roots Overhead {percent(roots)}%')
            print(f'   -> Waits Overhead {percent(waits)}%')
            print(f'   -> Max stall {maxStall} cycles')
            return stats
        elif ans & 0x7FFFF == 0x27FFC:
            # This value in the low 19 bits is treated as heap exhaustion
            print(f'Failed after {muts} mutation cycles')
            raise RuntimeError(f'Heap exhausted')
        else:
            print(f'Failed after {muts} mutation cycles')
            raise RuntimeError(f'Test failed! Expected {golden} but got {ret} ({hex(self.rxBuf[0])})')

## Ad hoc testing

We can make an instance of our driver (this actually loads our core onto the FPGA) and see if it responds.

In [None]:
# Program the FPGA with the 12k-node bitstream and pick a 20% GC threshold
ol = HeronOverlay("./12k/heron_ultra96.bit", 12*1024)
ol.set_gc_threshold(20)

# Run a single benchmark and compare against its expected return value
prog = 'adjoxo'
expected, binary = benches_12k.rets[prog], benches_12k.codes[prog]
ol.run(expected, binary)

If everything looks good, let's run through four pre-generated bitstreams with different heap sizes. For each bitstream, we'll run every benchmark program with various GC thresholds and save the results to a CSV file (one per heap size) for further analysis.

In [None]:
# One pass per heap size: load the matching bitstream, run every benchmark
# at every GC threshold from 5% to 95% in steps of 5, and write one CSV.
for heapK in [8,12,16,32]:
    print(f'Running heap @{heapK}k nodes')
    heapsize = heapK*1024
    ol = HeronOverlay(f'./{heapK}k/heron_ultra96.bit', heapsize)

    # The benchmark modules were imported earlier, so they are looked up
    # by name in sys.modules
    suite       = sys.modules[f'benchmarks.larges_r6_4_2_1_2_16_{heapK}k']
    codes, rets = suite.codes, suite.rets
    results     = []

    for thres in range(5,100,5):
        ol.set_gc_threshold(thres)
        print(f'Running threshold @{thres}')
        for bench, binary in codes.items():
            print(f'Running {bench}')
            stats = ol.run(rets[bench], binary)
            # Tag the measurement with the configuration it was taken under
            stats.update(threshold=thres, heap=heapsize, bench=bench)
            results.append(stats)

    df = pd.DataFrame(results)
    df.to_csv(f'{heapK}k_sweep.csv')