## Imports and installs

In [1]:
!pip install pandas --user



In [2]:
#%load_ext autoreload
#%autoreload 2

In [3]:
%%html
<style>
.container { width:90% !important; }
.rendered_html pre code {border: 0; background-color: #f4f4f4; display:inline-block;}
.rendered_html code.language-cpp {border: 0; background-color: #f4f4f4; display:inline-block;}
</style>

In [4]:
# Version tag of this scan script.
# NOTE(review): presumably stored alongside the scan results — confirm where it is consumed.
scan_script_version = "1.0"

In [5]:
import os

import numpy as np

# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
# import pytorch before onnx, so we make sure to import onnx first
import onnx  # NOQA

import pkg_resources as pk
from finn.custom_op.registry import getCustomOp
from finn.core.onnx_exec import execute_onnx
from finn.transformation.double_to_single_float import DoubleToSingleFloat
from finn.transformation.infer_shapes import InferShapes
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from finn.transformation.fold_constants import FoldConstants
from finn.transformation.general import (
    RemoveUnusedTensors,
    RemoveStaticGraphInputs,
    GiveReadableTensorNames,
    GiveUniqueNodeNames,
)
from finn.transformation.streamline import Streamline
from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
import finn.transformation.streamline.absorb as absorb
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
    ReplaceVerilogRelPaths,
)
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
from finn.util.basic import pynq_part_map
from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO

import brevitas.onnx as bo

from finn.util.visualization import showInNetron
import time
from finn.core.modelwrapper import ModelWrapper

from finn.util.vcd import get_all_stream_if_stats
import finn.util.vcd as vcd
import time
import json
import xml.etree.ElementTree as ET
import pandas as pd

import pkg_resources as pk
import matplotlib.pyplot as plt
import numpy as np
from finn.core.onnx_exec import execute_onnx
from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim
from multiprocessing import Process, Lock

from subprocess import TimeoutExpired
from glob import glob
from brevitas_examples.bnn_pynq.models.CNV import CNV
import torch

# one bit activation stuff
from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount

# new with 0.4
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
from finn.transformation.infer_datatypes import InferDataTypes
from finn.util.pytorch import ToTensor
from finn.transformation.merge_onnx_models import MergeONNXModels
from finn.core.datatype import DataType
from finn.transformation.insert_topk import InsertTopK
from finn.transformation.streamline.reorder import (
    MakeMaxPoolNHWC,
    MoveScalarLinearPastInvariants,
)
from finn.transformation.infer_data_layouts import InferDataLayouts
from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
from finn.util.test import (
    get_build_env,
    load_test_checkpoint_or_skip,
    get_example_input,
    get_trained_network_and_ishape,
    execute_parent,
    get_topk,
)
from scipy.stats import linregress
from finn.util.test import example_map
import psutil
import shutil

# for autotuning
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
from finn.analysis.fpgadataflow.res_estimation import res_estimation, res_estimation_complete
import copy

# For gzip
import gzip

## Function definitions

In [6]:
# Could be removed once upgraded to a newer version
import numpy as np
import os
import subprocess
import warnings

from finn.core.rtlsim_exec import rtlsim_exec
from finn.util.basic import gen_finn_dt_tensor


def throughput_test_remote(model, batchsize=1000, timeout=None):
    """Runs the throughput test for the given model remotely on the pynq board.
    The metadata properties related to the pynq board have to be set.
    Additionally a timeout for the SSH communication can be set.
    Returns a dictionary with results of the throughput test. Returns None
    if the test fails.

    Parameters
    ----------
    model
        Object providing ``get_metadata_prop(key)``. The properties pynq_ip,
        pynq_port, pynq_username, pynq_password, pynq_target_dir,
        pynq_deploy_dir, platform and bitfile are read.
    batchsize : int
        Batch size handed to the remote driver / run script.
    timeout : float or None
        Timeout in seconds, applied separately to the remote run and to the
        copy of the metrics file. None waits indefinitely.

    Raises
    ------
    AssertionError
        If the platform is neither "alveo" nor "zynq-iodma".
    Exception
        If a zynq platform is used with an empty password.
    subprocess.TimeoutExpired
        If one of the two shell commands exceeds ``timeout``. The child
        process is killed before the exception propagates.
    """

    pynq_ip = model.get_metadata_prop("pynq_ip")
    pynq_port = int(model.get_metadata_prop("pynq_port"))
    pynq_username = model.get_metadata_prop("pynq_username")
    pynq_password = model.get_metadata_prop("pynq_password")
    pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
    deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
    # extracting last folder of absolute path (deployment_dir)
    deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
    platform = model.get_metadata_prop("platform")
    assert platform in ["alveo", "zynq-iodma"]
    bitfile = model.get_metadata_prop("bitfile")
    bitfile = os.path.basename(bitfile)
    if pynq_password == "":
        if "zynq" in platform:
            raise Exception("PYNQ board remote exec needs password for sudo")
        else:
            local_prefix = ""  # assume we are using an ssh key
            warnings.warn("Empty password, make sure you've set up an ssh key")
    else:
        # NOTE(review): the password ends up on the shell command line.
        local_prefix = "sshpass -p %s " % pynq_password

    if platform == "alveo":
        # Alveo can run without sudo but needs correct environment
        remote_prefix = "conda activate finn-pynq-alveo; "
    elif "zynq" in platform:
        # PYNQ Zynq boards need to execute with sudo
        remote_prefix = "echo %s | sudo -S " % pynq_password

    # use platform attribute for correct remote execution
    if platform == "alveo":
        remote_cmd = "bash -ic 'bash alveo_run.sh throughput_test %d' \"" % batchsize
    else:
        remote_cmd = (
            "python3.6 driver.py --exec_mode=throughput_test --batchsize={} "
            "--bitfile={} --inputfile=input.npy --outputfile=output.npy "
            '--platform={} "'
        ).format(batchsize, bitfile, platform)
    cmd = (
        local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd
    ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder)
    bash_command = ["/bin/bash", "-c", cmd]
    # subprocess.run kills and reaps the child when the timeout expires;
    # Popen.communicate(timeout=...) would leave it running in the background.
    subprocess.run(bash_command, stdout=subprocess.PIPE, timeout=timeout)

    metrics_path = "{}/nw_metrics.txt".format(deployment_dir)

    # remove any pre-existing metrics file, so a failed copy below cannot be
    # mistaken for a fresh result
    try:
        os.remove(metrics_path)
    except FileNotFoundError:
        pass

    cmd = local_prefix + "scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
        pynq_port,
        pynq_username,
        pynq_ip,
        pynq_target_dir,
        deployment_folder,
        deployment_dir,
    )
    bash_command = ["/bin/bash", "-c", cmd]
    subprocess.run(bash_command, stdout=subprocess.PIPE, timeout=timeout)

    try:
        with open(metrics_path, "r") as file:
            # SECURITY NOTE(review): eval() executes whatever the remote side
            # wrote into nw_metrics.txt. Only use this with trusted boards;
            # consider ast.literal_eval once the file format is confirmed to
            # contain plain literals only.
            res = eval(file.read())
        return res
    except FileNotFoundError:
        return None

In [7]:
def get_checkpoint_name_generator(build_dir, save_file_prefix, topology, wbits, abits, parameter_priority):
    """Build a naming function for the checkpoints of one configuration.

    The returned callable ``get_checkpoint_name(step, get_full_path=True)``
    yields either the full ``.onnx`` path inside ``build_dir`` or, with
    ``get_full_path=False``, only the bare run name (without the step suffix
    when ``step`` is the empty string).
    """
    prio_tag = "prio_" + "-".join(parameter_priority)
    # Fields shared by every name variant, in format order.
    run_fields = (save_file_prefix, prio_tag, topology, wbits, abits)

    def get_checkpoint_name(step, get_full_path=True):
        if not get_full_path:
            if step == "":
                return "%s_%s_%s_w%da%d" % run_fields
            return "%s_%s_%s_w%da%d_%s" % (run_fields + (step,))
        return build_dir + "/end2end_%s_%s_%s_w%da%d_%s.onnx" % (run_fields + (step,))

    return get_checkpoint_name

# custom functions
def get_trained_network_and_ishape(topology, wbits, abits):
    """Return ``(trained_model, input_shape)`` for a BNN-PYNQ test config.

    ``topology`` must be "tfc" or "cnv"; any other value raises KeyError
    before a model is loaded.
    """
    input_shapes = dict(
        tfc=(1, 1, 28, 28),
        cnv=(1, 3, 32, 32),
    )
    # Look the shape up first so unknown topologies fail early.
    ishape = input_shapes[topology]
    return get_test_model_trained(topology.upper(), wbits, abits), ishape

def get_test_model_trained(netname, wbits, abits):
    """Shortcut for ``get_test_model`` with ``pretrained=True``."""
    pretrained_model = get_test_model(netname, wbits, abits, pretrained=True)
    return pretrained_model

def get_test_model(netname, wbits, abits, pretrained):
    """Returns the model specified by input arguments from the Brevitas BNN-PYNQ
    test networks. Pretrained weights loaded if pretrained is True.

    For bit widths above 2, and for the W2A1 combination, a custom-trained
    CNV network is built and, if ``pretrained`` is set, its weights are read
    from a ``best.tar`` checkpoint on disk. All other combinations are taken
    from ``example_map``.

    Returns the model in eval mode.

    Raises
    ------
    RuntimeError
        If custom weights are requested for a network other than "CNV".
    IndexError
        If no checkpoint folder matches the expected naming pattern.
    """
    needs_custom_weights = (wbits > 2) or (abits > 2) or ((wbits == 2) and (abits == 1))
    if needs_custom_weights:
        # get own custom trained models
        if netname != "CNV":
            raise RuntimeError("Custom weights are only supported for the CNV architecture")
        from brevitas_examples.bnn_pynq.models.CNV import CNV
        num_classes = 10
        # NOTE(review): 8 and 3 are presumably the input bit width and the
        # number of input channels — confirm against the CNV signature.
        fc = CNV(num_classes, wbits, abits, 8, 3)
        if pretrained:
            # Load checkpoint from disk
            folder_pattern = f"../../CIFAR10_networks_only_tar_files/CIFAR{num_classes}_CNV_{wbits}W{abits}A_*"
            # Sorted so the selected folder is deterministic when several match.
            folder_list = sorted(glob(folder_pattern))
            if not folder_list:
                # IndexError is kept as the exception type that the previous
                # bare folder_list[0] lookup produced.
                raise IndexError(f"No checkpoint folder found matching: {folder_pattern}")
            checkpoint_folder_location = folder_list[0] + "/checkpoints/"
            print("loading from folder: ", checkpoint_folder_location)

            # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
            checkpoint_dict = torch.load(checkpoint_folder_location + "best.tar", map_location=torch.device('cpu'))

            # load saved training results
            fc.load_state_dict(checkpoint_dict["state_dict"], strict=True)
    else:
        # get the models in the standard way
        model_cfg = (netname, wbits, abits)
        model_def_fxn = example_map[model_cfg]
        fc = model_def_fxn(pretrained)
    return fc.eval()

def load_bit_model(get_checkpoint_name, weight_bit_width, act_bit_width):
    """Export the trained CNV network to ONNX and tidy it up.

    Writes the "export", "export_2" and "tidy" checkpoints and returns the
    tidied model. For bit widths above 2 and for W2A1, a fixed set of scalar
    Mul parameters is overwritten with 1 before tidying.
    """
    trained_net, input_shape = get_trained_network_and_ishape("cnv", weight_bit_width, act_bit_width)

    # export to onnx
    bo.export_finn_onnx(trained_net, input_shape, get_checkpoint_name("export"))

    # Hotfix for non-unity multiplication in front of MaxPool
    model = ModelWrapper(get_checkpoint_name("export"))
    for naming_cls in (GiveUniqueNodeNames, GiveReadableTensorNames):
        model = model.transform(naming_cls())

    custom_trained = (weight_bit_width > 2) or (act_bit_width > 2) or (
        (weight_bit_width == 2) and (act_bit_width == 1)
    )
    if custom_trained:
        # Scalar Mul parameters that get forced to 1 (the last one is needed for W2A1)
        candidate_names = [
            "Mul_5_param0", "Mul_9_param0", "Mul_3_param0", "Mul_7_param0",
            "Mul_11_param0", "Mul_13_param0", "Mul_15_param0", "Mul_17_param0",
            "Mul_1_param0",
        ]
        unity = 1.
        # Snapshot the names that actually exist before touching any initializer
        present_names = [init.name for init in model.graph.initializer if init.name in candidate_names]
        for tensor_name in present_names:
            current_value = model.get_initializer(tensor_name)
            assert current_value.size == 1, "Multiplication is not a single value, but it should!"
            # Overwrite with the target value while retaining the datatype
            model.set_initializer(tensor_name, np.array(unity, dtype=current_value.dtype))

    model.save(get_checkpoint_name("export_2"))

    model = ModelWrapper(get_checkpoint_name("export_2"))
    tidy_steps = (
        InferShapes,
        FoldConstants,
        GiveUniqueNodeNames,
        GiveReadableTensorNames,
        InferDataTypes,
        RemoveStaticGraphInputs,
    )
    for tidy_cls in tidy_steps:
        model = model.transform(tidy_cls())
    model.save(get_checkpoint_name("tidy"))
    return model
    

def add_pre_and_postproc(get_checkpoint_name):
    """Attach input preprocessing and a Top-1 output stage to the tidied model.

    Reads the "tidy" checkpoint, writes the "preproc" and "pre_post"
    checkpoints and returns the merged, re-tidied model.
    """
    core_model = ModelWrapper(get_checkpoint_name("tidy"))
    input_shape = core_model.get_tensor_shape(core_model.graph.input[0].name)

    # preprocessing: torchvision's ToTensor divides uint8 inputs by 255
    preproc_module = ToTensor()
    preproc_path = get_checkpoint_name("preproc")
    bo.export_finn_onnx(preproc_module, input_shape, preproc_path)
    assert os.path.isfile(preproc_path)

    # put the preprocessing graph in front of the core model
    merged = core_model.transform(MergeONNXModels(ModelWrapper(preproc_path)))
    # add input quantization annotation: UINT8 for all BNN-PYNQ models
    merged.set_tensor_datatype(merged.graph.input[0].name, DataType.UINT8)
    # postprocessing: insert Top-1 node at the end
    merged = merged.transform(InsertTopK(k=1))
    out_path = get_checkpoint_name("pre_post")

    # tidy-up again
    for tidy_cls in (
        InferShapes,
        FoldConstants,
        GiveUniqueNodeNames,
        GiveReadableTensorNames,
        InferDataTypes,
        RemoveStaticGraphInputs,
    ):
        merged = merged.transform(tidy_cls())
    merged.save(out_path)
    return merged
    
def streamline_model(get_checkpoint_name):
    """Streamline the "pre_post" checkpoint and save it as "streamline".

    Applies the transformation sequence below in order (CNV networks only,
    so the convolution lowering is always applied) and returns the result.
    """
    out_path = get_checkpoint_name("streamline")
    model = ModelWrapper(get_checkpoint_name("pre_post"))

    pipeline = (
        # move past any reshapes to be able to streamline input scaling
        MoveScalarLinearPastInvariants,
        Streamline,
        # convolution-specific steps
        LowerConvsToMatMul,
        MakeMaxPoolNHWC,
        absorb.AbsorbTransposeIntoMultiThreshold,
        ConvertBipolarMatMulToXnorPopcount,
        Streamline,
        # absorb final add-mul nodes into TopK
        absorb.AbsorbScalarMulAddIntoTopK,
        InferDataLayouts,
        RemoveUnusedTensors,
    )
    for step_cls in pipeline:
        model = model.transform(step_cls())

    model.save(out_path)
    return model

def convert_to_hls_layers(get_checkpoint_name, SWG_SIMD_list = None, pruning_ratio = None, pruning_mode = None, mem_mode = "decoupled"):
    """Convert the streamlined model to HLS layers and save the result.

    Reads the "streamline" checkpoint and writes "convert_to_hls_layers".

    Parameters
    ----------
    get_checkpoint_name : callable
        Maps a step name to a checkpoint path.
    SWG_SIMD_list
        Passed through to the pruned convolution input generator transforms.
    pruning_ratio
        None selects the unpruned ``InferConvInpGen``; any other value
        selects a pruned variant according to ``pruning_mode``.
    pruning_mode : {"coarse", "fine"}
        Only read when ``pruning_ratio`` is not None.
    mem_mode : str
        Forwarded to the streaming FC layer inference transforms.

    Raises
    ------
    ValueError
        If ``pruning_ratio`` is given with an unsupported ``pruning_mode``.
        This is checked before any model is loaded or transformed.
    """
    use_pruning = pruning_ratio is not None
    # Fail fast: reject a bad pruning mode before doing any expensive work.
    if use_pruning and pruning_mode not in ("coarse", "fine"):
        raise ValueError(f"Pruning mode {pruning_mode} not supported!")

    prev_chkpt_name = get_checkpoint_name("streamline")
    model = ModelWrapper(prev_chkpt_name)
    # needed for bipolar MatMul layers
    model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
    # needed for non-bipolar MatMul layers
    model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
    # TopK to LabelSelect
    model = model.transform(to_hls.InferLabelSelectLayer())
    # input quantization (if any) to standalone thresholding
    model = model.transform(to_hls.InferThresholdingLayer())
    # needed for convolutions (only cnv networks are handled here);
    # optionally introduce pruned layers
    if not use_pruning:
        model = model.transform(to_hls.InferConvInpGen())
    elif pruning_mode == "coarse":
        model = model.transform(to_hls.InferConvInpGenPruned(pruning_ratio, adjust_following_MVAU=True, SIMD_list=SWG_SIMD_list))
    else:
        # pruning_mode == "fine", guaranteed by the check at the top
        model = model.transform(to_hls.InferConvInpGenSIMDPruned(SWG_SIMD_list, pruning_ratio, adjust_following_MVAU=True))
    model = model.transform(to_hls.InferStreamingMaxPool())
    model = model.transform(RemoveCNVtoFCFlatten())

    # get rid of Transpose -> Transpose identity seq
    model = model.transform(absorb.AbsorbConsecutiveTransposes())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(InferDataLayouts())
    model.save(get_checkpoint_name("convert_to_hls_layers"))
    return model

def creat_dataflow_model(get_checkpoint_name):
    """Partition the HLS-layer model and return the dataflow child model.

    Reads "convert_to_hls_layers", saves the partitioned parent as
    "dataflow_parent" and the model referenced by the first
    StreamingDataflowPartition node as "dataflow_model".
    """
    hls_model = ModelWrapper(get_checkpoint_name("convert_to_hls_layers"))
    parent = hls_model.transform(CreateDataflowPartition())
    parent.save(get_checkpoint_name("dataflow_parent"))

    # The partition node stores the file name of the extracted child model.
    partition_nodes = parent.get_nodes_by_op_type("StreamingDataflowPartition")
    partition_op = getCustomOp(partition_nodes[0])
    child = load_test_checkpoint_or_skip(partition_op.get_nodeattr("model"))
    child.save(get_checkpoint_name("dataflow_model"))
    return child

def configure_folding(get_checkpoint_name, folding):
    """Apply folding parameters to the dataflow model and save it as "fold".

    Parameters
    ----------
    get_checkpoint_name : callable
        Maps a step name to a checkpoint path; "dataflow_model" is read and
        "fold" is written.
    folding : dict
        Maps each StreamingFCLayer_Batch node name to a 4-tuple
        ``(PE, SIMD, inFIFODepth, ram_style)``. The SIMD value of the i-th
        entry (in dict order) is also used for the i-th unpruned
        ConvolutionInputGenerator node.

    Raises
    ------
    ValueError
        If a folding value violates one of the divisibility / size checks.
    """
    prev_chkpt_name = get_checkpoint_name("dataflow_model")
    model = ModelWrapper(prev_chkpt_name)

    model = model.transform(GiveUniqueNodeNames())
    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fcl in fc_layers:
        name = fcl.name
        pe, simd, ififodepth, ramstyle = folding[name]
        fcl_inst = getCustomOp(fcl)

        # Check compatibility
        mw = fcl_inst.get_nodeattr("MW")
        mh = fcl_inst.get_nodeattr("MH")
        if mh % pe != 0:
            raise ValueError(f'FINN requirement "MH divisable by PE" is violated. For layer: {name}; with MH: {mh}, PE: {pe}')
        if mw % simd != 0:
            raise ValueError(f'FINN requirement "MW divisable by SIMD" is violated. For layer: {name}; with MW: {mw}, SIMD: {simd}')
        if not (simd > (mw / 1024)):
            raise ValueError(f'Vivado requirement "SIMD > MW / 1024" is violated. For layer: {name}; with MW: {mw}, SIMD: {simd}')

        # Apply parameters
        fcl_inst.set_nodeattr("PE", pe)
        fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
        fcl_inst.set_nodeattr("ram_style", ramstyle)
        # Only adjust SIMD if the preceding (im2col) node is not a pruned
        # layer; pruning has already fixed SIMD in that case.
        if "Pruned" not in model.find_direct_predecessors(fcl)[0].name:
            fcl_inst.set_nodeattr("SIMD", simd)

    # use same SIMD values for the sliding window operators
    folding_list = list(folding.values())
    # Input FIFO depths per SWG node index.
    # NOTE(review): hard-coded for at most six SWG nodes per op type; more
    # nodes raise IndexError.
    swg_idepth = [2, 51, 9, 106, 2, 2]
    swg_layer_name_list = ["ConvolutionInputGenerator", "ConvolutionInputGeneratorPruned", "ConvolutionInputGeneratorSIMDPruned"]
    for swg_layer_name in swg_layer_name_list:
        swg_layers = model.get_nodes_by_op_type(swg_layer_name)
        for i, swg_node in enumerate(swg_layers):
            swg_inst = getCustomOp(swg_node)
            # Apply parameters
            # SIMD was already applied for the pruned swg nodes
            if "Pruned" not in swg_layer_name:
                ifm_ch = swg_inst.get_nodeattr("IFMChannels")
                swg_simd = folding_list[i][1]
                # Check compatibility
                if ifm_ch % swg_simd != 0:
                    # Report the SWG node itself, not the last FC layer seen above.
                    raise ValueError(f'FINN requirement "IFMChannels divisable by SIMD" is violated. For layer: {swg_node.name}; with IFMChannels: {ifm_ch}, SIMD: {swg_simd}')
                swg_inst.set_nodeattr("SIMD", swg_simd)
            swg_inst.set_nodeattr("inFIFODepth", swg_idepth[i])

    model = model.transform(GiveUniqueNodeNames())
    model.save(get_checkpoint_name("fold"))
    return model

def build_for_hardware(get_checkpoint_name, test_pynq_board, target_clk_ns, auto_set_FIFO_depth):
    """Run ZynqBuild on the "fold" checkpoint and save the result as "synth".

    The board, clock period and FIFO auto-sizing flag are forwarded to
    ZynqBuild unchanged. Returns the transformed model.
    """
    folded_model = ModelWrapper(get_checkpoint_name("fold"))
    zynq_build = ZynqBuild(
        platform=test_pynq_board,
        period_ns=target_clk_ns,
        auto_set_FIFO_depth=auto_set_FIFO_depth,
    )
    built_model = folded_model.transform(zynq_build)
    built_model.save(get_checkpoint_name("synth"))
    return built_model

def throughput_rtl_sim(get_checkpoint_name, test_pynq_board, target_clk_ns):
    """Generate IP for the folded model and measure throughput via RTL simulation.

    Reads the "fold" checkpoint and writes "ipgen_rtlsim" and
    "ipstitch_rtlsim"; the VCD trace path is derived from the "vcd" name.
    Sets the LIVENESS_THRESHOLD and RTLSIM_TRACE_DEPTH environment variables.

    Returns
    -------
    (model, rtl_throughput_dict)
        The stitched-IP rtlsim model and a dict with the keys perf, latency,
        n_nodes, batchsize, perf_est, cycles_per_sample_est, ret, res_cycles
        and est_cycles.
    """
    test_fpga_part = pynq_part_map[test_pynq_board]

    # test_ipgen
    prev_chkpt_name = get_checkpoint_name("fold")
    model = ModelWrapper(prev_chkpt_name)
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model.save(get_checkpoint_name("ipgen_rtlsim"))

    # test_ipstitch_rtlsim
    prev_chkpt_name = get_checkpoint_name("ipgen_rtlsim")
    model = ModelWrapper(prev_chkpt_name)
    model = model.transform(InsertDWC())
    model = model.transform(InsertFIFO())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(AnnotateCycles())
    perf = model.analysis(dataflow_performance)
    latency = perf["critical_path_cycles"]
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model = model.transform(PrepareRTLSim())
    model.set_metadata_prop("exec_mode", "rtlsim")
    # allow 10% more cycles than the estimated critical path
    os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1))
    vcdf = get_checkpoint_name("vcd") + ".vcd"
    model.set_metadata_prop("rtlsim_trace", vcdf)
    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
    rtlsim_chkpt = get_checkpoint_name("ipstitch_rtlsim")
    model.save(rtlsim_chkpt)
    parent_chkpt = get_checkpoint_name("dataflow_parent")
    input_tensor_npy = get_example_input("cnv")
    # The output tensor is not needed. The run is presumably what records
    # "cycles_rtlsim" in the rtlsim checkpoint, which is reloaded below —
    # TODO confirm.
    execute_parent(parent_chkpt, rtlsim_chkpt, input_tensor_npy)
    model = ModelWrapper(rtlsim_chkpt)
    perf["cycles_rtlsim"] = model.get_metadata_prop("cycles_rtlsim")

    # test_throughput_rtlsim
    prev_chkpt_name = get_checkpoint_name("ipstitch_rtlsim")
    model = ModelWrapper(prev_chkpt_name)
    n_nodes = len(model.graph.node)
    perf_est = model.analysis(dataflow_performance)
    # from here on latency is the measured rtlsim cycle count
    latency = int(model.get_metadata_prop("cycles_rtlsim"))
    cycles_per_sample_est = perf_est["max_cycles"]
    # two samples per node, limited to a maximum batchsize of 100
    batchsize = min(2 * n_nodes, 100)
    ret = throughput_test_rtlsim(model, batchsize=batchsize)
    res_cycles = ret["cycles"]
    est_cycles = latency + cycles_per_sample_est * batchsize

    rtl_throughput_dict = {
        "perf": perf,
        "latency": latency,
        "n_nodes": n_nodes,
        "batchsize": batchsize,
        "perf_est": perf_est,
        "cycles_per_sample_est": cycles_per_sample_est,
        "ret": ret,
        "res_cycles": res_cycles,
        "est_cycles": est_cycles,
    }

    return model, rtl_throughput_dict


def extract_fifo_max(get_checkpoint_name):
    """Return the FIFO count maxima reported by ``vcd.get_all_fifo_count_max``
    for the VCD trace of the "vcd" checkpoint name."""
    trace_path = get_checkpoint_name("vcd") + ".vcd"
    # The return values of these two listing calls are not needed here.
    vcd.list_stream_if(trace_path)
    vcd.list_fifo_count_signals(trace_path)
    return vcd.get_all_fifo_count_max(trace_path)

def extract_stream_stats(get_checkpoint_name):
    """Return ``get_all_stream_if_stats`` for the VCD trace of the "vcd"
    checkpoint name."""
    return get_all_stream_if_stats(get_checkpoint_name("vcd") + ".vcd")

def delete_vcd_file(get_checkpoint_name):
    """Delete the VCD trace of the "vcd" checkpoint name if it exists."""
    trace_path = get_checkpoint_name("vcd") + ".vcd"
    if not os.path.exists(trace_path):
        return
    os.remove(trace_path)


def extract_synth_and_ppr_timing_resources(get_checkpoint_name):
    """Parse the Vivado utilization, synthesis and timing reports of a build.

    The Vivado project directory is read from the "vivado_pynq_proj"
    metadata property of the "synth" checkpoint.

    Returns
    -------
    utilization_data : dict
        Maps each matched section title of the post-placement utilization
        report to a list of table rows; cells are floats where convertible,
        otherwise stripped strings.
    report_data : pandas.DataFrame
        The table from ``synth_report.xml``, with the first XML row used as
        column names.
    timing_data : dict
        Maps "Design Timing Summary" / "Clock Summary" titles to a dict of
        header -> value (float where convertible).
    """
    # Get the paths to the different report files
    model = ModelWrapper(get_checkpoint_name("synth"))
    vivado_proj = model.get_metadata_prop("vivado_pynq_proj")
    # fixed report locations relative to the Vivado project directory
    ppr_report_path = vivado_proj + "/finn_zynq_link.runs/impl_1/top_wrapper_utilization_placed.rpt"
    synth_report_path = vivado_proj + "/synth_report.xml"
    timing_report_path = vivado_proj + "/finn_zynq_link.runs/impl_1/top_wrapper_timing_summary_routed.rpt"
    
    # Read post place utilization report
    with open(ppr_report_path) as f:
        content = f.readlines()
    
    # Section titles to look for; two naming variants of the same sections are accepted
    top_levels_to_parse = ["1. Slice Logic", "2. Slice Logic Distribution", "3. Memory", "4. DSP", "6. Clocking"]
    new_top_levels = ["1. CLB Logic", "2. CLB Logic Distribution", "3. BLOCKRAM", "4. ARITHMETIC", "6. CLOCK"]
    top_levels_to_parse.extend(new_top_levels)

    # Parser state: current section, whether we are inside a table body, and
    # whether a section title was seen but its table has not started yet
    utilization_data = {}
    top_level_key = ""
    parse_enable = False
    waiting_for_table_start = False

    for line in content:
        # check if we stumbled upon one of the top level indicators
        if any(top in line for top in top_levels_to_parse):
            waiting_for_table_start = True
            # Find out which of them it was (first match in list order wins)
            for i in range(len(top_levels_to_parse)):
                if top_levels_to_parse[i] in line:
                    top_level_key = top_levels_to_parse[i]
                    break
            # (re)start the row list for this section; a repeated title resets it
            utilization_data[top_level_key] = []

        # Check for table start indicator: the header row containing "Available"
        if waiting_for_table_start:
            if "Available" in line:
                parse_enable = True
                waiting_for_table_start = False
                continue

        # parse a line
        if parse_enable:
            # reached a table border
            if "+--" in line:
                continue
            # reached end of table
            if "\n" == line or "* Note: Each Block RAM Tile only h" in line:
                parse_enable = False
                continue
            # parse table row: drop the text outside the outermost "|" separators
            line = line.strip()
            split_line = line.split("|")[1:-1]
            row_data = []
            for data_snipplet in split_line:
                d = data_snipplet.strip()
                # numeric cells become floats, everything else stays a string
                try:
                    d = float(d)
                except ValueError:
                    pass
                row_data.append(d)
            # skip rows with empty elements
            if '' in row_data:
                continue
            utilization_data[top_level_key].append(row_data)
    
    # Read the synthesis report
    root = ET.parse(synth_report_path).getroot()

    # Read the table header of the synthesis report
    table_header = []
    for child in root[0][0][0]:
        attribs = child.attrib
        table_header.append(attribs['contents'])

    # Read the rest of the table; each inner list is handed to the DataFrame
    # constructor below as one record
    table_cols = []
    for child1 in root[0][0][1:]:
        one_col = []
        for child2 in child1:
            # note: this reuses the name `content` from the utilization report above
            content = child2.attrib['contents'].strip()
            # Everything that looks like an int should become an int
            try:
                content = int(content)
            except ValueError:
                pass
            one_col.append(content)
        table_cols.append(one_col)


    # Store data in a nicer format
    report_data = pd.DataFrame(table_cols, columns=table_header)
    
    # Get timing data

    timing_data = {}

    with open(timing_report_path) as f:
        content = f.readlines()

    # Line offsets are counted from the title line, which counts as line 1
    found_start = False
    lines_to_header = 5
    lines_to_content = 7
    data_title = ""

    line_count = 0
    for line in content:
        if ("Design Timing Summary" in line) or ("Clock Summary" in line):
            found_start = True
            # drop the first character of the title line and surrounding whitespace
            data_title = line[1:].strip()
        if found_start:
            line_count += 1
        if line_count == lines_to_header:
            # columns are separated by runs of at least two spaces
            headers = line.strip().split("  ")
            headers = list(filter(lambda x: not (x == ""), headers ))
        if line_count == lines_to_content:
            data_part = line.strip().split("  ")
            data_part = list(filter(lambda x: not (x == ""), data_part))
            #data_part = [float(x) for x in data_part]

            # reset start finder
            found_start =  False
            line_count = 0

            # Save results: floats where possible, raw strings otherwise
            res_dict = {}
            for i in range(len(headers)):
                try:
                    res_dict[headers[i]] = float(data_part[i])
                except ValueError:
                    res_dict[headers[i]] = data_part[i]
            timing_data[data_title] = res_dict
    
    return utilization_data, report_data, timing_data


def run_throughput_test_on_HW(get_checkpoint_name, HW_lock, ip = os.getenv("PYNQ_IP", "192.168.1.106"),
                                username = os.getenv("PYNQ_USERNAME", "xilinx"),
                                password = os.getenv("PYNQ_PASSWORD", "xilinx"),
                                port = os.getenv("PYNQ_PORT", 22),
                                target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn"),
                                timeout=None,
                             ):
    """Deploy the synthesized model to the board and run the throughput test.

    Reads the "synth" checkpoint, writes "deploy", and then calls
    ``throughput_test_remote`` with batch sizes 1, 8, 64, 512 and 4096 until
    one of them returns None.

    Parameters
    ----------
    get_checkpoint_name : callable
        Maps a step name to a checkpoint path.
    HW_lock
        Context manager held for the whole deployment and measurement, so
        that only one throughput test runs at a time.
    ip, username, password, port, target_dir
        Connection settings forwarded to DeployToPYNQ. The defaults are read
        from the PYNQ_* environment variables once, when this function is
        defined. ``port`` is a str when it comes from the environment and the
        int 22 otherwise.
        NOTE(review): the fallback credentials are hard-coded; prefer setting
        the environment variables.
    timeout
        Forwarded to DeployToPYNQ and ``throughput_test_remote``.

    Returns
    -------
    dict
        ``{"ret": {batchsize: result, ...}, "largest_bsize": int or None}``;
        ``largest_bsize`` is None when no batch size succeeded.
    """
    ret = dict()
    largest_bsize = None
    # Acquire the HW lock first, so that we make sure to only run one throughput test at a time
    with HW_lock:
        # Deploy to FPGA board
        # test_deploy
        model = ModelWrapper(get_checkpoint_name("synth"))
        model = model.transform(
            DeployToPYNQ(
                ip,
                port,
                username,
                password,
                target_dir,
                timeout=timeout,
            )
        )
        # save the model to be able to link it to the parent
        model.save(get_checkpoint_name("deploy"))

        # test_throughput_hw
        model = ModelWrapper(get_checkpoint_name("deploy"))
        # try a range of batch sizes, some may fail due to insufficient DMA
        # buffers
        for bsize in [8 ** i for i in range(5)]:
            res = throughput_test_remote(model, bsize, timeout=timeout)
            if res is None:
                # assume we reached largest possible N
                break
            ret[bsize] = res
            largest_bsize = bsize

    HW_throughput_dict = {
        "ret": ret,
        "largest_bsize": largest_bsize,
    }
    return HW_throughput_dict

def delete_onnx_and_finn_artifacts(build_dir, print_name):
    """Delete the ONNX checkpoints of one test and the folders they point to.

    Every ``*.onnx`` file in ``build_dir`` whose name contains ``print_name``
    is opened; the directories referenced by its metadata properties and by
    the attributes of its FINN nodes are removed, then the ONNX file itself.
    Models referenced through a node's "model" attribute are queued and
    cleaned up the same way; the folders containing them are removed last.

    Args:
        build_dir: Directory that is searched for ONNX checkpoints.
        print_name: Substring identifying the checkpoints of the test.
    """
    def containing_dir(file_path):
        # Everything in front of the last "/" ("" if there is no slash).
        return file_path.rpartition("/")[0]

    def remove_tree_if_dir(candidate):
        if os.path.exists(candidate) and os.path.isdir(candidate):
            shutil.rmtree(candidate)

    metadata_props = ("pynq_driver_dir", "vivado_pynq_proj", "vivado_stitch_proj",
                      "rtlsim_so", "pynq_deploy_dir")
    node_attrs = ("code_gen_dir_ipgen", "rtlsim_so", "model")

    top_level_models = glob(build_dir + f"/*{print_name}*.onnx")
    # Filled while the top-level models are processed and consumed directly
    # afterwards; entries appended during its own pass are visited as well.
    referenced_models = []
    # Folders of referenced models; removed only after every model was handled.
    deferred_dirs = []

    for model_paths in (top_level_models, referenced_models):
        for model_path in model_paths:
            try:
                wrapper = ModelWrapper(model_path)
                dirs_of_this_model = []

                # Model-level metadata properties.
                for prop in metadata_props:
                    value = wrapper.get_metadata_prop(prop)
                    if value is not None:
                        # "rtlsim_so" names a file; its folder is what gets removed.
                        dirs_of_this_model.append(
                            containing_dir(value) if prop == "rtlsim_so" else value
                        )

                # Per-node attributes.
                for node in wrapper.get_finn_nodes():
                    custom_op = getCustomOp(node)
                    for attr in node_attrs:
                        try:
                            value = custom_op.get_nodeattr(attr)
                            if attr == "rtlsim_so":
                                value = containing_dir(value)
                            if attr == "model":
                                referenced_models.append(value)
                                deferred_dirs.append(containing_dir(value))
                                break
                            dirs_of_this_model.append(value)
                        except AttributeError:
                            pass

                for dirpath in dirs_of_this_model:
                    remove_tree_if_dir(dirpath)

                if os.path.exists(model_path):
                    os.remove(model_path)
            except AssertionError:
                # Happens when a model was queued twice and has already been
                # deleted by an earlier pass.
                pass

    # Final pass: folders that contained the referenced models.
    for dirpath in deferred_dirs:
        remove_tree_if_dir(dirpath)


def set_folding_for_model(folding, get_checkpoint_name, pruning_ratio = None, pruning_mode = None):
    """Rebuild the model from the HLS conversion onwards and apply ``folding``.

    The model is regenerated almost from scratch because the SIMD parameter
    influences how the pruned layers are inserted.

    Args:
        folding: Dict mapping layer names to folding lists; element 1 of each
            list is collected (in dict order) and passed on as ``SWG_SIMD_list``.
        get_checkpoint_name: Forwarded to every build step.
        pruning_ratio, pruning_mode: Forwarded to ``convert_to_hls_layers``.

    Returns:
        Whatever ``configure_folding`` returns for the given folding.
    """
    simd_per_layer = [layer_folding[1] for layer_folding in folding.values()]
    convert_to_hls_layers(
        get_checkpoint_name,
        SWG_SIMD_list=simd_per_layer,
        pruning_ratio=pruning_ratio,
        pruning_mode=pruning_mode,
    )
    creat_dataflow_model(get_checkpoint_name)
    return configure_folding(get_checkpoint_name, folding)

def speed_up_layer(model, layer_name, folding, get_checkpoint_name, speedup_parameter="SIMD", pruning_ratio = None, pruning_mode = None):
    """Try to double one folding parameter of a single layer.

    Args:
        model: Not used by this function; kept for the callers' signature.
        layer_name: Key into ``folding`` of the layer to speed up.
        folding: Current folding dict; it is deep-copied, never modified.
        get_checkpoint_name: Forwarded to ``set_folding_for_model``.
        speedup_parameter: "PE" (element 0 of the folding list) or "SIMD"
            (element 1).
        pruning_ratio, pruning_mode: Forwarded to ``set_folding_for_model``.

    Returns:
        ``(True, new_folding)`` if the doubled folding could be applied,
        ``(False, folding)`` if applying it raised ``ValueError``.

    Raises:
        ValueError: If ``speedup_parameter`` is neither "SIMD" nor "PE".
    """
    if speedup_parameter == "PE":
        slot = 0
    elif speedup_parameter == "SIMD":
        slot = 1
    else:
        raise ValueError("Can't optimize for paramter called " + speedup_parameter)

    candidate_folding = copy.deepcopy(folding)
    candidate_folding[layer_name][slot] *= 2

    # Applying the folding is the actual validity check.
    try:
        set_folding_for_model(candidate_folding, get_checkpoint_name, pruning_ratio=pruning_ratio, pruning_mode=pruning_mode)
    except ValueError:
        return False, folding
    return True, candidate_folding

def autotune_folding(inital_folding, get_checkpoint_name, chip_LUTs = 70560, max_LUT_utilization_ratio = 1.2,
                     parameter_priority = ["balanced"],
                     improve_non_critical_layers = False,
                     maximum_iterations = 200,
                     verbose=False,
                     pruning_ratio = None, pruning_mode = None
                    ):
    """Greedily increase the folding of the slowest layer within a LUT budget.

    Each iteration applies the current folding, looks up the layer with the
    most expected cycles, doubles one of its folding parameters via
    ``speed_up_layer`` and re-estimates the LUT usage. The loop stops when the
    estimated LUT ratio reaches ``max_LUT_utilization_ratio``, when no layer
    can be improved any more, or after ``maximum_iterations`` iterations.

    Args:
        inital_folding: Dict of layer name -> folding list; deep-copied.
        get_checkpoint_name: Forwarded to the build helpers.
        chip_LUTs: LUT count the estimate is divided by.
        max_LUT_utilization_ratio: The loop runs while
            ``current_LUTs / chip_LUTs`` is below this value.
        parameter_priority: ``["balanced"]`` or the order in which "SIMD" and
            "PE" are tried for StreamingFCLayer_Batch layers. Only read here.
        improve_non_critical_layers: If True, move on to the next slowest
            layer when the slowest one is unknown or cannot be improved;
            otherwise end the optimization.
        maximum_iterations: Upper bound on outer loop iterations.
        verbose: Print progress information.
        pruning_ratio, pruning_mode: Forwarded to the build helpers.

    Returns:
        ``(model, last_folding, maximum_folding_improvement)``. ``last_folding``
        is the folding that was in effect at the start of the final
        iteration (the change made during that iteration is not included);
        ``model`` is rebuilt with it. ``maximum_folding_improvement`` is True
        only if the layer list was exhausted without finding an improvement.
    """
    # Initialize variables
    current_LUTs = 0.
    last_folding = copy.deepcopy(inital_folding)
    current_folding = copy.deepcopy(inital_folding)

    i = 0
    optimization_running = True
    maximum_folding_improvement = False

    # Start optimizing
    while((current_LUTs/chip_LUTs < max_LUT_utilization_ratio) and optimization_running):
        # when we arrive back here then the current folding becomes the last folding
        last_folding = copy.deepcopy(current_folding)

        # apply the last folding
        model = set_folding_for_model(last_folding, get_checkpoint_name, pruning_ratio = pruning_ratio, pruning_mode = pruning_mode)
        # calculate cycle statistics for the model
        cycles_dict = exp_cycles_per_layer(model)
        # Sort layers by latency (ascending, so index -1 is the slowest layer)
        cycles_sorted = dict(sorted(cycles_dict.items(), key=lambda item: item[1]))

        # select layer to improve, starting with the slowest one
        layer_index = -1
        while optimization_running:
            try:
                slowest_layer = list(cycles_sorted.keys())[layer_index]
            except IndexError:
                # layer_index walked past the fastest layer: nothing left to try
                if verbose:
                    print("No more layers to optimize, ending optimization")
                optimization_running = False
                maximum_folding_improvement = True
                break
            slowest_cycles = cycles_sorted[slowest_layer]
            if verbose:
                print("\tImproving layer: ", slowest_layer)
            # make the layer faster
            layer_not_optimizable = False
            if "ConvolutionInputGenerator" in slowest_layer:
                # ConvolutionInputGenerators can only be tuned via the SIMD parameter
                # So we disregard the PE parameter here and only try to improve the SIMD parameter
                # The second "_"-separated token of the node name selects the
                # StreamingFCLayer_Batch entry whose folding gets adjusted.
                layer_name = slowest_layer.split("_")[1]
                layer_name = "StreamingFCLayer_Batch_" + layer_name
                speed_up_succesfull, current_folding = speed_up_layer(model, layer_name, current_folding, get_checkpoint_name, speedup_parameter="SIMD", pruning_ratio=pruning_ratio, pruning_mode=pruning_mode)
                if not speed_up_succesfull:
                    layer_not_optimizable = True
            elif "StreamingFCLayer_Batch" in slowest_layer:
                # This is for the normal StreamingFCLayer_Batch layers
                if parameter_priority[0] == "balanced":
                    # Balanced will try to adjust SIMD and PE to be equal
                    # But SIMD will get increased before PE if both are already equal
                    adjusted_layer_folding = current_folding[slowest_layer]
                    if adjusted_layer_folding[1] > adjusted_layer_folding[0]:
                        p_priority = ["PE", "SIMD"]
                    else:
                        p_priority = ["SIMD", "PE"]
                else:
                    p_priority = parameter_priority
                # NOTE(review): if p_priority were empty, speed_up_succesfull
                # would not be (re)assigned by this loop — confirm callers
                # never pass an empty list.
                for parameter in p_priority:
                    speed_up_succesfull, current_folding = speed_up_layer(model, slowest_layer, current_folding, get_checkpoint_name, speedup_parameter=parameter, pruning_ratio=pruning_ratio, pruning_mode=pruning_mode)
                    if speed_up_succesfull:
                        break
                if not speed_up_succesfull:
                    layer_not_optimizable = True
            else:
                # Layer not recognized
                if improve_non_critical_layers:
                    if verbose:
                        print("Layer not recognized: ", slowest_layer, " trying next")
                    layer_index -= 1
                    continue
                else:
                    if verbose:
                        print("Layer not recognized: ", slowest_layer, " ending optimization")
                    optimization_running = False
                    break

            if layer_not_optimizable:
                if improve_non_critical_layers:
                    if verbose:
                        print("No further improvement available for: ", slowest_layer, " trying to improve the next best one")
                    layer_index -= 1
                    continue
                else:
                    if verbose:
                        print("No further improvement available for: ", slowest_layer, " ending optimization")
                    optimization_running = False
                    break
            else:
                # a layer was sped up successfully; evaluate the new folding
                break

        # try to apply the new folding
        model = set_folding_for_model(current_folding, get_checkpoint_name, pruning_ratio = pruning_ratio, pruning_mode = pruning_mode)

        # Calculate the resource utilization for the current model
        res_dict = res_estimation(model)
        current_LUTs = sum([v['LUT'] for v in res_dict.values()])
        if verbose:
            print(f"current_LUTs={current_LUTs} of ultra96_maxLUT={chip_LUTs:.0f} \t-> percent: {100*current_LUTs/chip_LUTs:.1f} [%]")

        i += 1
        if i >= maximum_iterations:
            if verbose:
                print("Reached maximum number of iterations, ending optimization")
            break

    # When the loop ends because of the LUT check, the last optimization
    # overshot the resource budget.
    # This means that we can save the "last_folding" as the final result
    if verbose:
        print(f"Optimization ended after {i} iterations")
    model = set_folding_for_model(last_folding, get_checkpoint_name, pruning_ratio = pruning_ratio, pruning_mode = pruning_mode)
    return model, last_folding, maximum_folding_improvement

def assemble_inital_folding(pruning_ratio, pruning_mode):
    """Return the starting folding for the given pruning configuration.

    The result maps "StreamingFCLayer_Batch_0" .. "StreamingFCLayer_Batch_8"
    to four-element lists. Only element 1 of layers 1-5 depends on the
    pruning settings; everything else is the same for all modes.

    Args:
        pruning_ratio: Only evaluated for ``pruning_mode == "fine"``, where it
            must be 0.5, 0.75 or 0.875.
        pruning_mode: ``None``, "coarse" or "fine".

    Returns:
        A freshly built dict (with fresh lists) on every call.

    Raises:
        ValueError: For an unsupported pruning mode, or an unsupported
            pruning ratio in "fine" mode.
    """
    # Element 1 of StreamingFCLayer_Batch_1 .. _5, depending on the mode.
    if pruning_mode is None:
        mid_layer_values = (1, 1, 2, 2, 4)
    elif pruning_mode == "coarse":
        mid_layer_values = (1, 1, 1, 1, 4)
    elif pruning_mode == "fine":
        supported_p_r = [0.5, 0.75, 0.875]
        for ratio, value in zip(supported_p_r, (2, 4, 8)):
            if pruning_ratio == ratio:
                mid_layer_values = (value,) * 5
                break
        else:
            raise ValueError(f"Pruning ratio {pruning_ratio} not supported! Only: {supported_p_r} are supported for the fine method at this time.")
    else:
        raise ValueError(f"Pruning mode {pruning_mode} not supported!")

    # Mode-independent parts of layers 1-5 (elements 2 and 3).
    mid_layer_tails = ((256, "auto"), (256, "auto"), (256, "block"), (214, "auto"), (2, "auto"))

    rows = [(1, 3, 256, "auto")]
    rows.extend(
        (1, value, *tail) for value, tail in zip(mid_layer_values, mid_layer_tails)
    )
    rows.extend([
        (1, 1, 126, "distributed"),
        (1, 1, 62, "block"),
        (5, 1, 6, "distributed"),
    ])

    return {
        f"StreamingFCLayer_Batch_{index}": list(row)
        for index, row in enumerate(rows)
    }

def get_dataflow_model_strings(get_checkpoint_name):
    """Return the string form of every dataflow partition model.

    Loads the "synth" checkpoint, and for each "StreamingDataflowPartition"
    node loads the model referenced by its 'model' attribute and converts the
    wrapped ``.model`` object to ``str``.

    Args:
        get_checkpoint_name: Callable returning the path of the "synth"
            checkpoint.

    Returns:
        List of strings, one per partition node, in node order.
    """
    synth_model = ModelWrapper(get_checkpoint_name("synth"))
    partition_nodes = synth_model.get_nodes_by_op_type("StreamingDataflowPartition")
    return [
        str(ModelWrapper(getCustomOp(partition).get_nodeattr('model')).model)
        for partition in partition_nodes
    ]

In [8]:
def run_one_test(pruning_ratio, pruning_mode, wbits, abits, target_clk_ns, 
                 test_pynq_board, HW_lock, build_dir, topology, 
                 timeout=120, max_LUT_utilization_ratio_start = 1.4, parameter_priority = ["SIMD", "PE"],
                 auto_set_FIFO_depth = False,
                 start_export_data = None,
                ):
    """Search for the largest LUT budget that still builds, for one configuration.

    Repeatedly runs the flow (load, pre/post-processing, streamlining, folding
    auto-tuning, hardware build, report extraction, hardware throughput test)
    with a given ``max_LUT_utilization_ratio``. After each run the ratio is
    raised by 0.1 if the build succeeded or lowered by 0.1 if it failed. The
    search stops when the outcome flips compared to the first run (the
    "cross over point"), when a successful run has also exhausted all folding
    improvements, or when the ratio has dropped to 0.25 or below.

    After every run the accumulated results are written as gzip-compressed
    JSON to ``build_dir`` and to "/workspace/finn".

    Args:
        pruning_ratio, pruning_mode: Pruning configuration; select the initial
            folding and are forwarded to the folding auto-tuner.
        wbits, abits: Passed to ``load_bit_model`` and the checkpoint naming.
        target_clk_ns: Passed to ``build_for_hardware`` and the file prefix.
        test_pynq_board: Passed to ``build_for_hardware``.
        HW_lock: Passed to ``run_throughput_test_on_HW``.
        build_dir: Directory for checkpoints and the result file.
        topology: Used for the checkpoint naming and stored in the results.
        timeout: Passed to ``run_throughput_test_on_HW``.
        max_LUT_utilization_ratio_start: Ratio used for the first run.
        parameter_priority: Passed to ``autotune_folding``. Only read here.
        auto_set_FIFO_depth: Passed to ``build_for_hardware`` and the prefix.
        start_export_data: Optional result dict of an earlier run to continue
            from; it is deep-copied.

    Returns:
        ``export_data``: dict with the bookkeeping keys
        "largest PPRing max_LUT", 'PPR optimization successfull' and
        'last max_LUT for resuming', plus one entry per run keyed by
        ``str(max_LUT_utilization_ratio)``. Because the ratio is a float
        stepped by 0.1, these keys can carry floating-point noise.
    """
    save_file_prefix = generate_save_file_prefix(pruning_ratio, pruning_mode, target_clk_ns, auto_set_FIFO_depth)
    get_checkpoint_name = get_checkpoint_name_generator(build_dir, save_file_prefix, topology, wbits, abits, parameter_priority)
    print_name = get_checkpoint_name("", get_full_path=False)
    num_workers = read_num_workers_from_disk()
    os.environ['NUM_DEFAULT_WORKERS'] = str(num_workers)

    msg = "Started test for: {}, and num_workers: {}"
    msg = msg.format(print_name, num_workers)
    print(msg)

    # Get folding to start with
    inital_folding = assemble_inital_folding(pruning_ratio, pruning_mode)
    
    max_LUT_utilization_ratio = max_LUT_utilization_ratio_start
    running = True
    first_run = True
    first_run_succeded = True
    # Either continue from earlier results or start with empty bookkeeping
    if not (start_export_data == None):
        export_data = copy.deepcopy(start_export_data)
    else:
        export_data = {
            "largest PPRing max_LUT": 0.0,
            'PPR optimization successfull': False,
            'last max_LUT for resuming': 0.,
        }
    
    while(running):
        # check if the run failed over all
        # (done at the top so that settings from resumed runs are caught too)
        if max_LUT_utilization_ratio <= 0.25:
            running = False
            print(f"\t{print_name}: PPR failed too many times, ending here.")
            break
        # Initialize vars
        no_synth_error = True
        # run test
        print(f"\t{print_name}: load_bit_model")
        model = load_bit_model(get_checkpoint_name, wbits, abits)
        print(f"\t{print_name}: add_pre_and_postproc")
        model = add_pre_and_postproc(get_checkpoint_name)
        print(f"\t{print_name}: streamline_model")
        model = streamline_model(get_checkpoint_name)
        print(f"\t{print_name}: autotune_folding with max_LUT_utilization_ratio={max_LUT_utilization_ratio:.2f}")
        model, final_folding, maximum_folding_improvement = autotune_folding(inital_folding, get_checkpoint_name,
                                                max_LUT_utilization_ratio = max_LUT_utilization_ratio,
                                                parameter_priority = parameter_priority,
                                                improve_non_critical_layers = True,
                                                maximum_iterations = 200,
                                                verbose = False,
                                                pruning_ratio = pruning_ratio, pruning_mode = pruning_mode
                                               )
        print(f"\t{print_name}: {final_folding}")

        print(f"\t{print_name}: configure_folding")
        model = configure_folding(get_checkpoint_name, final_folding)
        print(f"\t{print_name}: build_for_hardware")
        # Get num workers again, to make sure we are up to date
        num_workers = read_num_workers_from_disk()
        os.environ['NUM_DEFAULT_WORKERS'] = str(num_workers)
        print(f"\t{print_name}: set env num workers to: {os.environ['NUM_DEFAULT_WORKERS']}")
        
        # Error slots stored with the results; None means "no error recorded"
        build_error = None
        ppr_report_error = None
        rtl_sim_error = None
        HW_test_error = None
        dataflow_model_strings = None
        try:
            model = build_for_hardware(get_checkpoint_name,
                                       test_pynq_board,
                                       target_clk_ns,
                                       auto_set_FIFO_depth,
                                      )
        except (Exception, AssertionError) as e:
            # Any build failure marks this run as failed; the reports are
            # still extracted if they are available.
            print(f"\t{print_name}: No dice with synthesis, skipping to the end")
            print(e)
            build_error = repr(e)
            try:
                print(f"\t{print_name}: extract_synth_and_ppr_timing_resources")
                utilization_data, report_data, timing_data = extract_synth_and_ppr_timing_resources(get_checkpoint_name)
                report_data = report_data.to_json()
            except (FileNotFoundError, AssertionError) as e:
                print(f"\t{print_name}: No dice with synthesis, skipping")
                print(e)
                ppr_report_error = repr(e)
                utilization_data = None
                report_data = None
                timing_data = None
                HW_throughput_dict = None
            rtl_throughput_dict = None
            fifo_max = None
            stream_stats = None
            HW_throughput_dict = None
            # make sure to skip the rest
            no_synth_error = False
        
        if no_synth_error:
            try:
                # The RTL simulation is currently disabled; its results are
                # recorded as None.
                print(f"\t{print_name}: Skipping throughput_rtl_sim")
                rtl_throughput_dict = None
                fifo_max = None
                stream_stats = None
                #print(f"\t{print_name}: throughput_rtl_sim")
                # Get num workers again, to make sure we are up to date
                #num_workers = read_num_workers_from_disk()
                #os.environ['NUM_DEFAULT_WORKERS'] = str(num_workers)
                #model, rtl_throughput_dict = throughput_rtl_sim(get_checkpoint_name, test_pynq_board, target_clk_ns)
                # Get num workers again, to make sure we are up to date
                #num_workers = read_num_workers_from_disk()
                #os.environ['NUM_DEFAULT_WORKERS'] = str(num_workers)
                #print(f"\t{print_name}: extract_fifo_max")
                #fifo_max = extract_fifo_max(get_checkpoint_name)
                #print(f"\t{print_name}: extract_stream_stats")
                #stream_stats = extract_stream_stats(get_checkpoint_name)
                #print(f"\t{print_name}: delete_vcd_file")
                #delete_vcd_file(get_checkpoint_name)
            except FileNotFoundError as e:
                print(f"\t{print_name}: No dice with rtl sim, skipping")
                print(e)
                rtl_sim_error = repr(e)
                rtl_throughput_dict = None
                fifo_max = None
                stream_stats = None
            try:
                print(f"\t{print_name}: extract_synth_and_ppr_timing_resources")
                utilization_data, report_data, timing_data = extract_synth_and_ppr_timing_resources(get_checkpoint_name)
                report_data = report_data.to_json()
                print(f"\t{print_name}: get_dataflow_model_strings")
                dataflow_model_strings = get_dataflow_model_strings(get_checkpoint_name)
                try:
                    print(f"\t{print_name}: run_throughput_test_on_HW")
                    HW_throughput_dict = run_throughput_test_on_HW(get_checkpoint_name, HW_lock,
                                                                   ip="ultra96-v2.ziti.uni-heidelberg.de",
                                                                   timeout=timeout,
                                                                  )
                except TimeoutExpired as e:
                    # A timed-out hardware test is recorded but does not fail the run
                    print(f"\t{print_name}: HW test timed out")
                    HW_test_error = repr(e)
                    HW_throughput_dict = None
            except (FileNotFoundError, AssertionError) as e:
                print(f"\t{print_name}: No dice with synthesis, skipping")
                print(e)
                ppr_report_error = repr(e)
                utilization_data = None
                report_data = None
                timing_data = None
                HW_throughput_dict = None


        # Do final clean up
        print(f"\t{print_name}: delete_onnx_and_finn_artifacts")
        delete_onnx_and_finn_artifacts(build_dir, print_name)

        # save data to disk
        data = {
            "FINN version": "0.4b-dev",
            "test_pynq_board": test_pynq_board,
            "target_clk_ns": target_clk_ns,
            "save_file_prefix": save_file_prefix,
            "topology": topology, 
            "wbits": wbits, 
            "abits": abits,
            "folding": final_folding,
            "print_name": print_name,
            "rtl_throughput_dict": rtl_throughput_dict,
            "fifo_max": fifo_max,
            "stream_stats": stream_stats,
            "PPR_utilization": utilization_data,
            "SYNTH_utilization": report_data,
            "timing_data": timing_data,
            "HW_throughput_dict": HW_throughput_dict,
            "max_LUT_utilization_ratio": max_LUT_utilization_ratio,
            "parameter_priority": parameter_priority,
            "inital_folding": inital_folding,
            "auto_set_FIFO_depth": auto_set_FIFO_depth,
            "pruning_ratio": pruning_ratio, 
            "pruning_mode": pruning_mode,
            "scan_script_version": scan_script_version,
            "maximum_folding_improvement": maximum_folding_improvement,
            "build_error": build_error,
            "ppr_report_error": ppr_report_error,
            "rtl_sim_error": rtl_sim_error,
            "HW_test_error": HW_test_error,
            "dataflow_model_strings": dataflow_model_strings,
        }
        
        # Check if the run failed or succeeded
        # (only a build error counts as failure; report or HW test errors do not)
        run_succeded = True
        if no_synth_error == False:
            run_succeded = False
        
        # Update max value if run succeeded
        if run_succeded:
            export_data["largest PPRing max_LUT"] = max_LUT_utilization_ratio
        # Append data to export data and save result
        export_data[str(max_LUT_utilization_ratio)] = data
        print(export_data.keys())
        # Add data for resuming a run
        export_data['last max_LUT for resuming'] = max_LUT_utilization_ratio
        
        # Handle differently if this was the first run
        # (the first run fixes the search direction: up on success, down on failure)
        if first_run:
            if run_succeded:
                max_LUT_utilization_ratio += 0.1
                print(f"\t{print_name}: PPR succeded on first try, running with higher max_LUT_utilization_ratio={max_LUT_utilization_ratio:.2f}")
            else:
                max_LUT_utilization_ratio -= 0.1
                print(f"\t{print_name}: PPR failed on first try, running with lower max_LUT_utilization_ratio={max_LUT_utilization_ratio:.2f}")
            first_run = False
            first_run_succeded = run_succeded
        else:
            # Check if we reached a cross over point
            # Or if we are still on the same trajectory, then we need to further increase/decrease the max_LUT_utilization_ratio
            if first_run_succeded == run_succeded:
                if run_succeded:
                    # Stop here if we maxed out the folding
                    if maximum_folding_improvement:
                        export_data['PPR optimization successfull'] = True
                        running = False
                        print(f"\t{print_name}: PPR succeded on current try and maxed out folding improvements, stopping here. Last max_LUT_utilization_ratio={max_LUT_utilization_ratio:.2f}")
                    else:
                        max_LUT_utilization_ratio += 0.1
                        print(f"\t{print_name}: PPR succeded on current try, running with higher max_LUT_utilization_ratio={max_LUT_utilization_ratio:.2f}")
                else:
                    max_LUT_utilization_ratio -= 0.1
                    print(f"\t{print_name}: PPR failed on current try, running with lower max_LUT_utilization_ratio={max_LUT_utilization_ratio:.2f}")
            else:
                # We arrived at the cross over point, so we can stop here.
                print(f"\t{print_name}: Arrived at cross over point, stoping here. Last max_LUT_utilization_ratio={max_LUT_utilization_ratio:.2f}")
                export_data['PPR optimization successfull'] = True
                running = False
        
        # Persist the accumulated results after every run, once in build_dir
        # and once in the fixed "/workspace/finn" location
        print(f"\t{print_name}: Run ended, dumping data")
        result_file_name = build_dir + "/{}.json.gz".format(print_name)
        with gzip.open(result_file_name, "wt") as write_file:
            json.dump(export_data, write_file)

        result_file_name = "/workspace/finn" + "/{}.json.gz".format(print_name)
        with gzip.open(result_file_name, "wt") as write_file:
            json.dump(export_data, write_file)
        
        # check if the run failed over all
        # We do this at the top now to catch old settings coming from resuming old runs
        #if max_LUT_utilization_ratio <= 0.25:
        #    running = False
        #    print(f"\t{print_name}: PPR failed too many times, ending here.")

    print("Finished test for: {}".format(print_name))

    return export_data

def read_cpu_target_from_disk(cpu_percentace_max):
    """Read the CPU usage target from ``cpu_percentage_max.txt``.

    The file is looked up in the module-level ``build_dir`` and must contain
    a single float. A message is printed when the value on disk differs from
    the one passed in.

    Args:
        cpu_percentace_max: The currently used target, for comparison only.

    Returns:
        The value read from disk as ``float``.
    """
    target_path = build_dir + '/cpu_percentage_max.txt'
    with open(target_path, "r") as target_file:
        target_on_disk = float(target_file.read())

    if cpu_percentace_max != target_on_disk:
        print(f"cpu_percentace_max now: {target_on_disk:.1f}")
    return target_on_disk


def read_num_workers_from_disk():
    """Read the worker count from ``num_workers.txt``.

    The file is looked up in the module-level ``build_dir`` and must contain
    a single integer.

    Returns:
        The value read from disk as ``int``.
    """
    workers_path = build_dir + '/num_workers.txt'
    with open(workers_path, "r") as workers_file:
        return int(workers_file.read())

In [9]:
def generate_save_file_prefix(pruning_ratio, pruning_mode, target_clk_ns, auto_set_FIFO_depth):
    """Build the file name prefix that identifies one test configuration.

    The prefix has the form
    ``autoTune_<ratio>_<mode>_[autoFIFO_]clk_ns_<target_clk_ns>``, where the
    ratio is printed with two decimals. Without a pruning ratio the first
    part is ``autoTune_None_`` and the pruning mode is left out.

    Args:
        pruning_ratio: Pruning ratio or ``None``.
        pruning_mode: Pruning mode; only used when a ratio is given.
        target_clk_ns: Target clock period, inserted as-is.
        auto_set_FIFO_depth: Adds the "autoFIFO_" tag when it equals ``True``.

    Returns:
        The assembled prefix string.
    """
    if pruning_ratio is None:
        pruning_tag = "autoTune_None_"
    else:
        pruning_tag = f"autoTune_{pruning_ratio:.2f}_{pruning_mode}_"

    # Deliberately an equality check against True, not a truthiness test.
    fifo_tag = "autoFIFO_" if auto_set_FIFO_depth == True else ""

    return "".join((pruning_tag, fifo_tag, f"clk_ns_{target_clk_ns}"))

## Settings

In [None]:
# Working directory of the scan. read_cpu_target_from_disk() and
# read_num_workers_from_disk() read their control files from here.
# NOTE: the second assignment overrides the first, so the "/tmp/..." path is
# the one in effect; swap the two lines to switch.
build_dir = "/workspace/finn"
build_dir = "/tmp/finn_dev_hendrik"
# NOTE(review): ENABLE_NETRON is not referenced in the visible part of the
# notebook — presumably toggles Netron model visualisation; confirm.
ENABLE_NETRON = False

In [10]:
# Setup the matrix of known LUT budgets for unpruned data
# Denoted as [W][A]: the outer key is the weight bit width, the inner key the
# activation bit width. The value is used as the starting
# max_LUT_utilization_ratio for that combination.
inital_LUT_budget = {
    1:{1:1.4, 2:1.2, 3:1.3, 4:1.0, 5:0.7},
    2:{2:1.1, 3:1.0, 4:0.9, 5:0.7},
    3:{2:0.9, 3:0.9, 4:0.7},
    4:{2:0.9, 3:0.9},
}
# Starting ratio for [W][A] combinations that are missing from the matrix above
inital_LUT_budget_for_unknown = 0.5

In [11]:
# Target board. NOTE: the second assignment overrides the first, so "Ultra96"
# is the board in effect; swap the two lines to switch.
test_pynq_board = "Pynq-Z1"
test_pynq_board = "Ultra96"
# FPGA part looked up for the selected board
test_fpga_part = pynq_part_map[test_pynq_board]
# Target clock period in nanoseconds
target_clk_ns = 10

# CPU usage threshold (percent): a new test thread is only started once the
# measured CPU usage is at or below this value. It is refreshed from disk via
# read_cpu_target_from_disk() while the scan is running.
cpu_percentace_max = 94.0
# Passed to run_one_test / generate_save_file_prefix (adds the "autoFIFO_" tag)
auto_set_FIFO_depth = True

In [12]:
# Parameter grid of the scan; the commented-out lines are alternative settings.
pruning_ratio_list = [0.25, 0.5, 0.75, 0.875]
#pruning_ratio_list = [0.5, 0.75, 0.875]
#pruning_ratio_list = [None]

pruning_mode_list = ["coarse"]
#pruning_mode_list = ["fine"]
#pruning_mode_list = [None]

# Bit widths; used for both the weight and the activation bit width
bit_list = list(range(1, 6))

#parameter_priority_list = [["SIMD", "PE"],["PE", "SIMD"],["balanced"]]
#parameter_priority_list = [["SIMD", "PE"],["balanced"]]
parameter_priority_list = [["PE", "SIMD"],["balanced"]]

# Display the grid; the last element is the total number of combinations
# (bit_list enters squared: once for weights, once for activations).
pruning_ratio_list, pruning_mode_list, bit_list, parameter_priority_list,  len(parameter_priority_list) * len(pruning_ratio_list) * (len(bit_list) **2) * len(pruning_mode_list)

([0.25, 0.5, 0.75, 0.875],
 ['coarse'],
 [1, 2, 3, 4, 5],
 [['PE', 'SIMD'], ['balanced']],
 200)

## Parallel implementation

In [None]:
# Parallel implementation
#
# Sweeps every (pruning mode, weight bits, activation bits, pruning ratio,
# parameter priority) combination and runs each one in its own process.
# Configurations whose result file reports them as finished are skipped;
# partially finished ones are resumed from their result file.

thread_list = []  # holds the Process objects started below
HW_lock = Lock()  # handed to every worker
pruning_ratio = None

topology = "cnv"
i = 0  # number of processes started

for pruning_mode in pruning_mode_list:
    for weight_bit_width in bit_list:
        for act_bit_width in bit_list:
            for pruning_ratio in pruning_ratio_list:
                for parameter_priority in parameter_priority_list:
                    # start the next thread only when the required CPU resources become available;
                    # the sentinel load of 1000 forces at least one real measurement, and the
                    # CPU target / worker count are re-read from disk on every poll
                    curr_cpu = 1000.
                    cpu_percentace_max = read_cpu_target_from_disk(cpu_percentace_max)
                    while curr_cpu > cpu_percentace_max:
                        curr_cpu = psutil.cpu_percent(interval=10.)
                        cpu_percentace_max = read_cpu_target_from_disk(cpu_percentace_max)
                        num_workers = read_num_workers_from_disk()
                        os.environ['NUM_DEFAULT_WORKERS'] = str(num_workers)

                    # Try to get a sane estimate for the first max_LUT_utilization_ratio_start
                    try:
                        max_LUT_utilization_ratio_start = inital_LUT_budget[weight_bit_width][act_bit_width]
                    except KeyError:
                        max_LUT_utilization_ratio_start = inital_LUT_budget_for_unknown

                    # For FIFO auto tuning, try to start off with already tested stuff:
                    # look for the result file of the same configuration without auto FIFO
                    if auto_set_FIFO_depth:
                        save_file_prefix = generate_save_file_prefix(pruning_ratio, pruning_mode, target_clk_ns, False)
                        get_checkpoint_name = get_checkpoint_name_generator(build_dir, save_file_prefix, topology, weight_bit_width, act_bit_width, parameter_priority)
                        print_name = get_checkpoint_name("", get_full_path=False)
                        result_file_name = build_dir + "/{}.json.gz".format(print_name)
                        if os.path.isfile(result_file_name):
                            with gzip.open(result_file_name, 'rb') as json_file:
                                finn_data_raw = json.load(json_file)
                            max_LUT_utilization_ratio_start = finn_data_raw['largest PPRing max_LUT']
                            print("Found non-auto FIFO data from: {}, with max_LUT: {}".format(print_name, max_LUT_utilization_ratio_start))
                            # if the previous run had failed, we start out at 0.3, because there is little hope that this works any ways
                            if max_LUT_utilization_ratio_start == 0.:
                                max_LUT_utilization_ratio_start = 0.3

                    # Check if the output file already exists
                    save_file_prefix = generate_save_file_prefix(pruning_ratio, pruning_mode, target_clk_ns, auto_set_FIFO_depth)
                    get_checkpoint_name = get_checkpoint_name_generator(build_dir, save_file_prefix, topology, weight_bit_width, act_bit_width, parameter_priority)
                    print_name = get_checkpoint_name("", get_full_path=False)
                    result_file_name = build_dir + "/{}.json.gz".format(print_name)
                    start_export_data = None
                    if os.path.isfile(result_file_name):
                        print("Already exists for: {}".format(print_name))
                        # Check if we can run off of this data or if it was already completed
                        with gzip.open(result_file_name, 'rb') as json_file:
                            finn_data_raw = json.load(json_file)
                        # Handle old data format
                        try:
                            finished = finn_data_raw['PPR optimization successfull']
                        except KeyError:
                            finished = finn_data_raw["Finished PPR optimization"]

                        if finished:
                            print("Already finished for: {}, continuing".format(print_name))
                            continue

                        # Check if the optimization ended in a 0, then it's also finished
                        if finn_data_raw['largest PPRing max_LUT'] == 0.:
                            print("Already unseccessfully finished for: {}, continuing".format(print_name))
                            continue

                        # resume the optimization here
                        start_export_data = finn_data_raw
                        if 'last max_LUT for resuming' in finn_data_raw:
                            max_LUT_utilization_ratio_start = finn_data_raw['last max_LUT for resuming']
                        else:
                            # Older result files carry no resume marker: fall back to the
                            # keys that parse as numbers and skip all other keys.
                            lut_list = []
                            for key in finn_data_raw:
                                try:
                                    lut_list.append(float(key))
                                except ValueError:
                                    pass
                            # Without any numeric key there is nothing to resume from, so the
                            # start value chosen above is kept (max()/min() of an empty list
                            # would raise ValueError and abort the whole sweep).
                            if lut_list:
                                if max(lut_list) > max_LUT_utilization_ratio_start:
                                    max_LUT_utilization_ratio_start = max(lut_list)
                                else:
                                    max_LUT_utilization_ratio_start = min(lut_list)
                            print("Resuming for: {}, with max_LUT_utilization_ratio_start: {}".format(print_name, max_LUT_utilization_ratio_start))

                    # kick off the next thread
                    num_workers = read_num_workers_from_disk()
                    os.environ['NUM_DEFAULT_WORKERS'] = str(num_workers)
                    args = (pruning_ratio, pruning_mode, weight_bit_width, act_bit_width, target_clk_ns, test_pynq_board, HW_lock, build_dir, topology)
                    kwargs = {
                        "parameter_priority": parameter_priority,
                        "max_LUT_utilization_ratio_start": max_LUT_utilization_ratio_start,
                        "auto_set_FIFO_depth": auto_set_FIFO_depth,
                        "start_export_data": start_export_data
                    }
                    print("Starting from main thread: {}".format(print_name))
                    t = Process(target=run_one_test, args=args, kwargs=kwargs)
                    t.start()
                    thread_list.append(t)
                    i += 1
                

In [14]:
# Poll once per second until none of the started processes is alive any more,
# rewriting a single status line in place.
number_p_alive = 1
while number_p_alive > 0:
    number_p_alive = sum(1 for worker in thread_list if worker.is_alive())
    print(f"Processes running: {number_p_alive}     ", end="\r")
    time.sleep(1.)

# Join every process now that all of them have ended
for t in thread_list:
    t.join()
print("")
print("All threads finished")

Processes running: 0      
All threads finished


## Serial implementation for testing

In [None]:
# serial implementation: settings for running one single configuration in this process
HW_lock = Lock()
test_pynq_board = "Ultra96"
target_clk_ns = 10
topology, weight_bit_width, act_bit_width = "cnv", 1, 1

#pruning_ratio = 0.5
pruning_ratio = None
#pruning_mode = "fine"
pruning_mode = None

# Take the last entry of the priority list (currently ["balanced"]). A fixed
# index of 2 raises IndexError whenever the list has fewer than three entries,
# as the active two-entry list does.
parameter_priority = parameter_priority_list[-1]
max_LUT_utilization_ratio_start = 1.4

num_workers = read_num_workers_from_disk()
# The worker count read from disk is deliberately overridden with a fixed value here
#os.environ['NUM_DEFAULT_WORKERS'] = str(num_workers)
os.environ['NUM_DEFAULT_WORKERS'] = str(20)
print(os.environ['NUM_DEFAULT_WORKERS'])

In [None]:
# Run the single configuration set up above in the current process and keep
# the returned export data.
# NOTE(review): the unit of `timeout` is not visible here - confirm against run_one_test.
data = run_one_test(
    pruning_ratio,
    pruning_mode,
    weight_bit_width,
    act_bit_width,
    target_clk_ns,
    test_pynq_board,
    HW_lock,
    build_dir,
    topology,
    timeout=120,
    max_LUT_utilization_ratio_start=max_LUT_utilization_ratio_start,
    parameter_priority=parameter_priority,
    auto_set_FIFO_depth=True,
)