In [1]:
from fastfusion import arch
from fastfusion import constraints
from fastfusion import workload
import fastfusion

# Specification-wide variables; passed to fastfusion.Specification further down.
# NOTE(review): the unit of "tech_node" is not stated in this notebook —
# presumably metres (1e-9 == 1 nm); confirm against the component models.
variables = dict(tech_node=1e-9)

# Cycle period shared by the whole architecture. It is also used to scale the
# MainMemory bandwidth figure. Presumably seconds (1 ns) — TODO confirm.
global_cycle_period = 1e-9

# MainMemory: the DRAM node, listed first in the architecture.
# NOTE(review): sizes look like bit counts (4 Gi * 8, with an 8-bit datawidth)
# — presumably 4 GiB expressed in bits; confirm the unit fastfusion expects.
main_memory = arch.Memory(
    name="MainMemory",
    component_class="DRAM",
    attributes={
        "_size": 4 * 1024 * 1024 * 1024 * 8,
        # 614e9 divided by the 8192-wide row, scaled by the cycle period, i.e.
        # row-wide accesses per cycle. NOTE(review): whether 614e9 is bits/s or
        # bytes/s is not visible here — verify.
        "_bandwidth_reads_plus_writes_per_cycle": 614e9 / 8192 * global_cycle_period,
        "_datawidth": 8,
        "width": 8192,
        # size / width; true division, so this is a float (4194304.0).
        "depth": 4 * 1024 * 1024 * 1024 * 8 / 8192,
    },
)

# Keep every tensor except intermediates in main memory.
main_memory.constraints.tensors.keep = "~Intermediates()"

# GlobalBuffer: the SRAM node listed directly after MainMemory.
# NOTE(review): _size is 128 Mi * 8 — presumably 128 MiB in bits; confirm.
global_buffer = arch.Memory(
    name="GlobalBuffer",
    component_class="SRAM",
    attributes={
        "_size": 1024 * 1024 * 128 * 8,
        "_datawidth": 8,
        "_bandwidth_reads_plus_writes_per_cycle": 1,
        "width": 16384,
        # size / width; true division, so this is a float (65536.0).
        "depth": 1024 * 1024 * 128 * 8 / 16384,
    },
)

# Tensor constraints: every tensor is kept in this buffer.
global_buffer.constraints.tensors.keep = "All()"
# Constraint expression evaluated by fastfusion; it selects All() or Nothing()
# depending on whether "~MainMemory.tensors()" is non-empty.
global_buffer.constraints.tensors.no_refetch_from_above = (
    "All() if ~MainMemory.tensors() else Nothing()"
)

# Single allowed tensor ordering: tensors held in MainMemory first, the rest after.
global_buffer.constraints.dataflow.tensor_order_options = [
    ["MainMemory.tensors()", "~MainMemory.tensors()"],
]

# LocalBuffer: SRAM node with a spatial fanout of 4 along dimension "Z".
# NOTE(review): _size is 4 Mi * 8 — presumably 4 MiB in bits; confirm.
local_buffer = arch.Memory(
    name="LocalBuffer",
    component_class="SRAM",
    attributes={
        "_size": 1024 * 1024 * 4 * 8,
        "_datawidth": 8,
        "_bandwidth_reads_plus_writes_per_cycle": 1,
        "width": 16384,
        # size / width; true division, so this is a float (2048.0).
        "depth": 1024 * 1024 * 4 * 8 / 16384,
    },
    spatial=[dict(name="Z", fanout=4)],
)

# Only the tensors renamed to "input" and "output" are kept here.
local_buffer.constraints.tensors.keep = "input | output"
# Single allowed tensor ordering: output before input.
local_buffer.constraints.dataflow.tensor_order_options = [["output", "input"]]

# Constraint on the "Z" fanout: the utilization floor is 1 when the Einsum has
# more than two tensors and 0 otherwise; the loop bound is tied to the weight
# tensor's rank variables.
# NOTE(review): exact semantics of loop_bounds "== 1" are defined by
# fastfusion — confirm against its constraint docs.
z_constraint = dict(
    name="Z",
    min_utilization="1 if len(All()) > 2 else 0",
    loop_bounds=[dict(expression="weight.rank_variables()", operator="==", value=1)],
)
local_buffer.constraints.spatial.append(constraints.Spatial(**z_constraint))
del z_constraint  # keep the notebook namespace identical to the original cell

# Register: an 8-unit storage node replicated across a 128 x 128 ("X" by "Y")
# spatial array.
# NOTE(review): with _datawidth=8 this presumably holds a single element per
# instance — confirm.
register = arch.Memory(
    name="Register",
    component_class="dummy_storage",
    spatial=[dict(name="X", fanout=128), dict(name="Y", fanout=128)],
    attributes={"_size": 8, "_datawidth": 8},
)

# Only the weight tensor is kept in the register; everything else bypasses it.
register.constraints.tensors.keep = "weight"
register.constraints.tensors.bypass = "~weight"

# One spatial constraint per array dimension, both with a utilization floor of 1.
# "X" is tied to the input tensor's rank variables, "Y" to the output tensor's.
register.constraints.spatial.append(
    constraints.Spatial(
        name="X",
        min_utilization=1,
        loop_bounds=[dict(expression="input.rank_variables()", operator="==", value=1)],
    )
)
register.constraints.spatial.append(
    constraints.Spatial(
        name="Y",
        min_utilization=1,
        loop_bounds=[dict(expression="output.rank_variables()", operator="==", value=1)],
    )
)

# MAC: integer multiply-accumulate compute node (8-wide multiplier, 16-wide adder).
compute = arch.Compute(
    name="MAC",
    component_class="intmac",
    attributes={"multiplier_width": 8, "adder_width": 16},
)

# Enabled only when the expression holds, i.e. when All() has exactly three
# members — presumably Einsums with three tensors such as a matmul; confirm.
compute.constraints.misc.enabled = "len(All()) == 3"

# scalar_unit: placeholder compute node with 128 computes per cycle.
scalar_unit = arch.Compute(
    name="scalar_unit",
    component_class="dummy_compute",
    attributes={"area": 1, "_computes_per_cycle": 128},
)

# Enabled only when All() has exactly two members — presumably Einsums with one
# input and one output tensor; confirm.
scalar_unit.constraints.misc.enabled = "len(All()) == 2"


# Assemble the architecture. Node order is preserved exactly as written:
# MainMemory, GlobalBuffer, LocalBuffer, scalar_unit, Register, MAC.
# NOTE(review): this rebinding shadows the imported `arch` module, so
# `arch.Memory` / `arch.Compute` are unreachable after this line until the
# import at the top of the cell runs again. Consider a distinct name if nothing
# else depends on `arch` being the Arch instance.
arch = arch.Arch(
    nodes=[main_memory, global_buffer, local_buffer, scalar_unit, register, compute],
    global_cycle_period=global_cycle_period,
)




# Single-Einsum workload: T1[m, n1] is produced from T0[m, n0] and W0[n0, n1].
acc0 = workload.TensorAccess(name="T0", projection=["m", "n0"])
acc1 = workload.TensorAccess(name="W0", projection=["n0", "n1"])
acc2 = workload.TensorAccess(name="T1", projection=["m", "n1"], output=True)

# Role aliases used by the architecture constraints ("input", "weight", "output").
renames = dict(input="T0", weight="W0", output="T1")

matmul1 = workload.Einsum(
    name="Matmul1",
    tensor_accesses=[acc0, acc1, acc2],
    renames=renames,
)

# Rank-variable bounds: m in [0, 128), n0 in [0, 64), n1 in [0, 128).
# NOTE(review): this rebinding shadows the imported `workload` module for the
# rest of the cell.
workload = workload.Workload(
    einsums=[matmul1],
    shape={
        "m": "0 <= m < 128",
        "n0": "0 <= n0 < 64",
        "n1": "0 <= n1 < 128",
    },
)

# Bundle architecture, workload and variables into one specification, then ask
# the FFM mapper to generate the pmappings for it.
spec = fastfusion.Specification(arch=arch, workload=workload, variables=variables)
pmappings = fastfusion.mapper.FFM.make_pmappings(spec)

Generating pmapping templates for compute scalar_unit Einsum Matmul1: 0it [00:00, ?it/s]
Generating pmapping templates for compute MAC Einsum Matmul1: 18it [00:00, 63.39it/s]
Generating jobs: 100%|██████████| 2/2 [00:01<00:00,  1.57it/s]
Generating pmappings: 100%|██████████| 18/18 [00:04<00:00,  3.80it/s]


In [9]:
"""
The following code is adapted from main.py in the ZigZag repository.

To run the code, first install ZigZag with `pip3 install zigzag-dse`.
"""
from datetime import datetime
import time

from zigzag import api

# Inputs for the ZigZag run; paths are relative to the working directory.
workload_path = "workloads/zigzag/gemm_16k.yaml"
accelerator_path = "architectures/zigzag/tpu_like.yaml"
mapping_path = "mapping/zigzag/tpu_custom.yaml"

# Timestamp identifying this run. It is formatted explicitly when building the
# output path: str(datetime) yields "YYYY-MM-DD HH:MM:SS.ffffff", whose space
# and colons are invalid in Windows paths and need quoting in shells.
experiment_id = datetime.now()
dump_folder = f"outputs/zigzag/{experiment_id:%Y-%m-%d_%H-%M-%S-%f}"
# Derived from dump_folder so the two locations cannot drift apart.
pickle_filename = f"{dump_folder}/cmes.pickle"

# Forwarded as `lpf_limit` below.
# NOTE(review): presumably ZigZag's loop-prime-factor limit for the temporal
# mapping search — confirm against the ZigZag docs.
LPF_LIMIT = 7

# perf_counter() is monotonic and high-resolution, so the measured duration is
# not affected by wall-clock adjustments while the search runs.
start = time.perf_counter()
energy, latency, cmes = api.get_hardware_performance_zigzag(
    workload=workload_path,
    accelerator=accelerator_path,
    mapping=mapping_path,
    opt="energy",
    dump_folder=dump_folder,
    pickle_filename=pickle_filename,
    lpf_limit=LPF_LIMIT,
)
end = time.perf_counter()
duration = end - start

print(f"Time = {duration:.2e} s")
print(f"Total energy = {energy:.2e} pJ")
print(f"Total latency = {latency:.2e} cycles")

2025-09-05 12:45:13,279 - run +49 - INFO - Processing  Layer0...
2025-09-05 12:45:13,280 - run +93 - INFO - Layer0: Launching spatial mapping 1/1: {D1: {C: 128}, D2: {K: 128}}.
100%|██████████| 2520/2520 [00:08<00:00, 283.24it/s]
2025-09-05 12:45:22,181 - run +48 - INFO - Saved CostModelEvaluation(Layer0, Core(0)) with energy 1.592e+12 and latency 5.536e+08 to outputs/zigzag/2025-09-05 12:45:13.262479/Layer0_complete.json
2025-09-05 12:45:22,214 - run +95 - INFO - Saved CumulativeCME with energy 1.592e+12 and latency 5.536e+08 to outputs/zigzag/2025-09-05 12:45:13.262479/overall_simple.json
2025-09-05 12:45:22,215 - run +150 - INFO - Saved pickled list of 1 CMEs to outputs/zigzag/2025-09-05 12:45:13.262479/cmes.pickle.


Time = 8.95e+00 s
Total energy = 1.59e+12 pJ
Total latency = 5.54e+08 cycles
