# Quantized Model Synthesis
This notebook picks up where the cleanup notebook left off: it takes the tidied ONNX model and converts it into a synthesized hardware design using FINN's `build_dataflow` tool.

### Build Estimation
Estimate the model's deployment parameters (throughput, latency, and per-layer resource usage) without launching synthesis.

In [1]:
#imports
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

In [3]:
# Build parameters shared by the build cells below.

# Input model and deployment target
model_file = "quant_models/facileV3_4b_1_tidy.onnx"
board = "Ultra96"
fpga_part = "xczu3eg-sbva484-1-e"  # FPGA part the bitfile is generated for

# Performance targets
target_fps = 100  # target inference rate, in batches/sec
# Target clock period in ns.
# NOTE(review): 200 ns is a 5 MHz clock (the rtlsim report below shows
# "fclk[mhz]": 5.0) - confirm this is intended and not meant to be 200 MHz (5.0 ns).
clk_per_ns = 200.0

In [8]:
# Estimate-only build: restrict the builder to the estimate steps and request
# only the estimate reports, so no synthesis is launched.
estimates_output_dir = "output_estimates_only"

cfg = build.DataflowBuildConfig(
    steps=build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
    output_dir=estimates_output_dir,
    fpga_part=fpga_part,  # Ultra96 part
    synth_clk_period_ns=clk_per_ns,
    target_fps=target_fps,
)

# Kept as the cell's last expression so the builder's return value is displayed.
build.build_dataflow_cfg(model_file, cfg)

Building dataflow accelerator from quant_models/facileV3_4b_1_tidy.onnx
Intermediate outputs will be generated in /tmp/finn_dev_mtrahms
Final outputs will be generated in output_estimates_only
Build log is at output_estimates_only/build_dataflow.log
Running step: step_tidy_up [1/7]
Running step: step_streamline [2/7]
Running step: step_convert_to_hls [3/7]
Running step: step_create_dataflow_partition [4/7]
Running step: step_target_fps_parallelization [5/7]
Running step: step_apply_folding_config [6/7]
Running step: step_generate_estimate_reports [7/7]
Completed successfully


0

In [9]:
# List what the estimate-only build wrote to its output directory.
! ls {estimates_output_dir}

intermediate_models  report  time_per_step.json


In [10]:
# List the report files generated by the estimate-only build.
! ls {estimates_output_dir}/report

estimate_layer_config_alternatives.json  estimate_network_performance.json
estimate_layer_cycles.json		 op_and_param_counts.json
estimate_layer_resources.json


In [11]:
# Print the network-level performance estimate as raw JSON.
! cat {estimates_output_dir}/report/estimate_network_performance.json

{
  "critical_path_cycles": 812,
  "max_cycles": 434,
  "max_cycles_node_name": "StreamingFCLayer_Batch_0",
  "estimated_throughput_fps": 11520.737327188941,
  "estimated_latency_ns": 162400.0
}

In [12]:
import json
def read_json_dict(filename):
    """Load a JSON file and return its parsed contents.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; pass the encoding explicitly instead of relying on
    # the platform's locale-dependent default.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)

In [13]:
# Per-layer cycle estimates from the estimate-only build.
read_json_dict(f"{estimates_output_dir}/report/estimate_layer_cycles.json")

{'StreamingFCLayer_Batch_0': 434,
 'StreamingFCLayer_Batch_1': 341,
 'StreamingFCLayer_Batch_2': 33,
 'StreamingFCLayer_Batch_3': 3,
 'Thresholding_Batch_0': 1}

In [14]:
# Per-layer resource estimates from the estimate-only build.
read_json_dict(f"{estimates_output_dir}/report/estimate_layer_resources.json")

{'StreamingFCLayer_Batch_0': {'BRAM_18K': 1,
  'BRAM_efficiency': 0.14127604166666666,
  'LUT': 2588,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingFCLayer_Batch_1': {'BRAM_18K': 1,
  'BRAM_efficiency': 0.11100260416666667,
  'LUT': 2727,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingFCLayer_Batch_2': {'BRAM_18K': 1,
  'BRAM_efficiency': 0.0107421875,
  'LUT': 2588,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingFCLayer_Batch_3': {'BRAM_18K': 1,
  'BRAM_efficiency': 0.0009765625,
  'LUT': 2308,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'Thresholding_Batch_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 6,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'total': {'BRAM_18K': 4.0, 'LUT': 10217.0, 'URAM': 0.0, 'DSP': 0.0}}

### Build
Launch a synthesis build, configured in the same way as the estimate-only build in the cells above.
This launches Vivado and can take a while.

In [15]:
# Synthesis build: request the stitched IP, an RTL-simulation performance
# measurement and an out-of-context synthesis report.
rtlsim_output_dir = "output_ipstitch_ooc_rtlsim"

cfg = build.DataflowBuildConfig(
    generate_outputs=[
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
    ],
    output_dir=rtlsim_output_dir,
    fpga_part=fpga_part,
    synth_clk_period_ns=clk_per_ns,
    target_fps=target_fps,
)

# Kept as the cell's last expression so the builder's return value is displayed.
build.build_dataflow_cfg(model_file, cfg)

Building dataflow accelerator from quant_models/facileV3_4b_1_tidy.onnx
Intermediate outputs will be generated in /tmp/finn_dev_mtrahms
Final outputs will be generated in output_ipstitch_ooc_rtlsim
Build log is at output_ipstitch_ooc_rtlsim/build_dataflow.log
Running step: step_tidy_up [1/15]
Running step: step_streamline [2/15]
Running step: step_convert_to_hls [3/15]
Running step: step_create_dataflow_partition [4/15]
Running step: step_target_fps_parallelization [5/15]
Running step: step_apply_folding_config [6/15]
Running step: step_generate_estimate_reports [7/15]
Running step: step_hls_ipgen [8/15]
Running step: step_set_fifo_depths [9/15]
Running step: step_create_stitched_ip [10/15]
Running step: step_measure_rtlsim_performance [11/15]
Running step: step_make_pynq_driver [12/15]
Running step: step_out_of_context_synthesis [13/15]
Running step: step_synthesize_bitfile [14/15]
Running step: step_deployment_package [15/15]
Completed successfully


0

In [16]:
# List the contents of the stitched-IP output directory.
! ls {rtlsim_output_dir}/stitched_ip

all_verilog_srcs.txt		       ip
finn_vivado_stitch_proj.cache	       make_project.sh
finn_vivado_stitch_proj.hw	       make_project.tcl
finn_vivado_stitch_proj.ip_user_files  vivado.jou
finn_vivado_stitch_proj.srcs	       vivado.log
finn_vivado_stitch_proj.xpr


In [17]:
# List the report files written by this build.
! ls {rtlsim_output_dir}/report

estimate_layer_resources_hls.json  rtlsim_performance.json
ooc_synth_and_timing.json


In [18]:
# Print the out-of-context synthesis and timing report as raw JSON.
! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json

{
  "vivado_proj_folder": "/tmp/finn_dev_mtrahms/synth_out_of_context_to9kungs/results_finn_design_wrapper",
  "LUT": 4544.0,
  "FF": 2816.0,
  "DSP": 0.0,
  "BRAM": 0.0,
  "WNS": 190.033,
  "": 0,
  "fmax_mhz": 100.33109260559834,
  "estimated_throughput_fps": 231177.63273179342
}

In [19]:
# Print the RTL-simulation performance report as raw JSON.
! cat {rtlsim_output_dir}/report/rtlsim_performance.json

{
  "cycles": 3422,
  "runtime[ms]": 0.6844,
  "throughput[images/s]": 10227.936879018118,
  "DRAM_in_bandwidth[Mb/s]": 0.10739333722969023,
  "DRAM_out_bandwidth[Mb/s]": 0.007670952659263587,
  "fclk[mhz]": 5.0,
  "N": 7,
  "latency_cycles": 782
}

In [20]:
# Print the final hardware configuration written by the build as raw JSON.
! cat {rtlsim_output_dir}/final_hw_config.json

{
  "Defaults": {},
  "StreamingFIFO_0": {
    "ram_style": "auto",
    "depth": 32,
    "impl_style": "rtl"
  },
  "StreamingFCLayer_Batch_0": {
    "PE": 1,
    "SIMD": 1,
    "ram_style": "auto",
    "resType": "lut",
    "mem_mode": "decoupled",
    "runtime_writeable_weights": 0
  },
  "StreamingFIFO_1": {
    "ram_style": "auto",
    "depth": 32,
    "impl_style": "rtl"
  },
  "StreamingFCLayer_Batch_1": {
    "PE": 1,
    "SIMD": 1,
    "ram_style": "auto",
    "resType": "lut",
    "mem_mode": "decoupled",
    "runtime_writeable_weights": 0
  },
  "StreamingFCLayer_Batch_2": {
    "PE": 1,
    "SIMD": 1,
    "ram_style": "auto",
    "resType": "lut",
    "mem_mode": "decoupled",
    "runtime_writeable_weights": 0
  },
  "StreamingFCLayer_Batch_3": {
    "PE": 1,
    "SIMD": 1,
    "ram_style": "auto",
    "resType": "lut",
    "mem_mode": "decoupled",
    "runtime_writeable_weights": 0
  },
  "Thresholding_Batch_0": {
    "PE": 1,
 

### Build PYNQ
Build a bitfile for a PYNQ board.

In [21]:
# PYNQ build: target the board by name with the Vivado Zynq shell flow and
# request the bitfile, the PYNQ driver and the deployment package.
# `build` and `build_cfg` come from the import cell at the top of the notebook;
# they are not re-imported here so that all imports stay in one place.
final_output_dir = "output_final"

cfg = build.DataflowBuildConfig(
    output_dir=final_output_dir,
    target_fps=target_fps,
    synth_clk_period_ns=clk_per_ns,
    board=board,
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ],
)

# Kept as the cell's last expression so the builder's return value is displayed.
build.build_dataflow_cfg(model_file, cfg)

Building dataflow accelerator from quant_models/facileV3_4b_1_tidy.onnx
Intermediate outputs will be generated in /tmp/finn_dev_mtrahms
Final outputs will be generated in output_final
Build log is at output_final/build_dataflow.log
Running step: step_tidy_up [1/15]
Running step: step_streamline [2/15]
Running step: step_convert_to_hls [3/15]
Running step: step_create_dataflow_partition [4/15]
Running step: step_target_fps_parallelization [5/15]
Running step: step_apply_folding_config [6/15]
Running step: step_generate_estimate_reports [7/15]
Running step: step_hls_ipgen [8/15]
Running step: step_set_fifo_depths [9/15]
Running step: step_create_stitched_ip [10/15]
Running step: step_measure_rtlsim_performance [11/15]
Running step: step_make_pynq_driver [12/15]
Running step: step_out_of_context_synthesis [13/15]
Running step: step_synthesize_bitfile [14/15]
Running step: step_deployment_package [15/15]
Completed successfully


0