In [1]:
import csv
import pickle
import time

from IPython.display import display, SVG

import fastfusion
from fastfusion.util import set_n_parallel_jobs
from fastfusion.mapper import Metrics

from paths import ARCH_DIR, WORKLOADS_DIR

In [2]:
def make_spec(arch_fname, workload_fname):
    """Load a fastfusion specification and configure its FFM mapper options.

    Args:
        arch_fname: File name of the architecture YAML, resolved against
            ``ARCH_DIR``.
        workload_fname: File name of the workload YAML, resolved against
            ``WORKLOADS_DIR``.

    Returns:
        The specification built by ``fastfusion.Specification.from_yaml``,
        with the mapper metrics set to energy and latency and
        ``max_fused_loops_per_rank_variable`` set to 1.
    """
    arch_path = ARCH_DIR / arch_fname
    workload_path = WORKLOADS_DIR / workload_fname
    spec = fastfusion.Specification.from_yaml(arch_path, workload_path)

    # Evaluate mappings on both energy and latency.
    spec.mapper.ffm.metrics = Metrics.ENERGY | Metrics.LATENCY
    # Alternative knob, currently disabled: spec.mapper.ffm.max_fused_loops = 3
    spec.mapper.ffm.max_fused_loops_per_rank_variable = 1
    return spec

def run_make_pmappings(spec):
    """Run ``fastfusion.mapper.FFM.make_pmappings`` on ``spec`` and time it.

    Args:
        spec: Specification passed unchanged to ``make_pmappings``.

    Returns:
        tuple: ``(pmappings, duration)`` where ``pmappings`` is the value
        returned by ``make_pmappings`` and ``duration`` is the elapsed time
        of that call in seconds (float).
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the system clock and can jump if the
    # clock is adjusted while the (long-running) call is in progress.
    start = time.perf_counter()
    pmappings = fastfusion.mapper.FFM.make_pmappings(spec)
    duration = time.perf_counter() - start
    return pmappings, duration

def run_join_pmappings(spec, pmappings):
    """Run ``fastfusion.mapper.FFM.join_pmappings`` and time it.

    Args:
        spec: Specification passed unchanged to ``join_pmappings``.
        pmappings: Pmappings passed unchanged to ``join_pmappings``.

    Returns:
        tuple: ``(mappings, duration)`` where ``mappings`` is the value
        returned by ``join_pmappings`` and ``duration`` is the elapsed time
        of that call in seconds (float).
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the system clock and can jump if the
    # clock is adjusted while the call is in progress.
    start = time.perf_counter()
    mappings = fastfusion.mapper.FFM.join_pmappings(spec, pmappings)
    duration = time.perf_counter() - start
    return mappings, duration

In [3]:
# Build the specification used by every later cell from the chosen
# architecture and workload YAML files.
spec = make_spec(
    arch_fname="snowcat.arch.yaml",
    workload_fname="mobilenet_7.yaml",
)



In [4]:
# Number of parallel jobs handed to set_n_parallel_jobs; named so it is easy
# to find and tune for the machine running the notebook.
N_PARALLEL_JOBS = 12

set_n_parallel_jobs(N_PARALLEL_JOBS)
# Keep the timing alongside the result so the cost of this step is recorded.
pmappings, make_duration = run_make_pmappings(spec)

Generating pmapping templates for compute MAC Einsum PwiseB2: 12it [00:00, 74.45it/s]
Generating pmapping templates for compute MAC Einsum PwiseA0: 12it [00:00, 71.63it/s]
Generating pmapping templates for compute MAC Einsum PwiseA1: 24it [00:00, 111.29it/s]
Generating pmapping templates for compute MAC Einsum PwiseB0: 24it [00:00, 106.29it/s]
Generating pmapping templates for compute MAC Einsum Dwise1: 126it [00:00, 276.13it/s]
Generating pmapping templates for compute MAC Einsum Dwise0: 126it [00:00, 289.02it/s]
Generating jobs: 100%|██████████| 6/6 [00:01<00:00,  3.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["fused_loop_indices"] = get_fused_loop_indices(
Generating pmappings: 100%|██████████| 92/92 [00:17<00:00,  5.13it/s]
Grouping pmappings for PwiseA0:

Dwise1: 6.12e04 total, 3.84e04 (1/2) valid, 8.45e04 (1/1) evaluated, 1.38e03 (1/45) Pareto-Optimal
Dwise0: 1.34e05 total, 7.85e04 (1/2) valid, 1.31e05 (1/1) evaluated, 3.70e03 (1/36) Pareto-Optimal
PwiseA0: 8.05e03 total, 3.71e03 (1/2) valid, 4.58e03 (1/2) evaluated, 621 (1/13) Pareto-Optimal
PwiseB0: 1.15e04 total, 6.33e03 (1/2) valid, 9.06e03 (1/1) evaluated, 1.58e03 (1/7) Pareto-Optimal
PwiseA1: 1.15e04 total, 6.33e03 (1/2) valid, 9.06e03 (1/1) evaluated, 1.58e03 (1/7) Pareto-Optimal
PwiseB2: 8.18e03 total, 3.84e03 (1/2) valid, 4.71e03 (1/2) evaluated, 621 (1/13) Pareto-Optimal
Total: 2.35e05 total, 1.37e05 (1/2) valid, 2.42e05 (1/1) evaluated, 9.47e03 (1/25) Pareto-Optimal





In [5]:
# Checkpoint the generated pmappings to disk so the join step can be rerun
# later without regenerating them. `pickle` is imported in the notebook's
# top import cell.
with open("pmappings.pkl", "wb") as f:
    pickle.dump(pmappings, f)

In [6]:
# Sanity check: for every pmapping table whose "Total<SEP>energy" column is
# not strictly positive everywhere, print that column followed by the
# n_iterations, stride, and initial columns to help diagnose the problem.
for sims in pmappings.einsum2pmappings.values():
    for sim in sims:
        data = sim.mappings._data
        total_energy = data["Total<SEP>energy"]
        if (total_energy > 0).all():
            continue  # Every row has positive energy; nothing to report.
        print(total_energy)
        for keyword in ("n_iterations", "stride", "initial"):
            matching_cols = [name for name in data.columns if keyword in name]
            print(data[matching_cols])

In [7]:
# To reuse previously saved pmappings instead of the in-memory ones,
# uncomment the following lines before running the join:
# import pickle
# with open("pmappings.pkl", "rb") as f:
#     pmappings = pickle.load(f)
mappings, join_duration = run_join_pmappings(spec=spec, pmappings=pmappings)

Einsum PwiseA0 has 621 pmappings with 37 compatibilities
Einsum Dwise0 has 3696 pmappings with 178 compatibilities
Einsum PwiseB0 has 1577 pmappings with 129 compatibilities
Einsum PwiseA1 has 1577 pmappings with 129 compatibilities
Einsum Dwise1 has 1376 pmappings with 84 compatibilities
Einsum PwiseB2 has 621 pmappings with 37 compatibilities


Compressing pmappings: 100%|██████████| 6/6 [00:00<00:00, 19.91it/s]


Not tracking GlobalBuffer because it is never reserved for multiple pmappings.
Not tracking MainMemory because it is never reserved for multiple pmappings.


Removing unneeded reservations for PwiseA0: 100%|██████████| 74/74 [00:00<00:00, 2180.97it/s]
Removing unneeded reservations for Dwise0: 100%|██████████| 356/356 [00:00<00:00, 2767.14it/s]
Removing unneeded reservations for PwiseB0: 100%|██████████| 258/258 [00:00<00:00, 2218.23it/s]
Removing unneeded reservations for PwiseA1: 100%|██████████| 258/258 [00:00<00:00, 2416.21it/s]
Removing unneeded reservations for Dwise1: 100%|██████████| 168/168 [00:00<00:00, 372.93it/s]
Removing unneeded reservations for PwiseB2: 100%|██████████| 74/74 [00:00<00:00, 2284.70it/s]
Grouping pmappings for Dwise0 (2/6): 100%|██████████| 178/178 [00:00<00:00, 1337.29it/s]
Grouping pmappings for PwiseB0 (3/6): 100%|██████████| 129/129 [00:00<00:00, 1595.67it/s]
Grouping pmappings for PwiseA1 (4/6): 100%|██████████| 129/129 [00:00<00:00, 1583.49it/s]
Grouping pmappings for Dwise1 (5/6): 100%|██████████| 84/84 [00:00<00:00, 1356.31it/s]
Grouping pmappings for PwiseB2 (6/6): 100%|██████████| 37/37 [00:00<00:00, 

In [9]:
# Print every field of the first row of the joined mappings table,
# one "name value" pair per line.
data = mappings.data
first_row = data.iloc[0]
for column, value in first_row.items():
    print(column, value)

tensor<SEP>TA0 49
Total<SEP>latency 0.03847872
Total<SEP>energy 808800.0
tensor<SEP>TB0 47040
tensor<SEP>T1 1
tensor<SEP>TA1 47040
tensor<SEP>TB1 49
PwiseA0<SEP>stride0 7
PwiseA0<SEP>stride1 7
PwiseA0<SEP>stride2 960
PwiseA0<SEP>stride3 1.0
PwiseA0<SEP>stride4 7.0
PwiseA0<SEP>stride5 7.0
PwiseA0<SEP>latency<SEP>compute 0.0075264
PwiseA0<SEP>first_latency<SEP>MAC<SEP>13 7840.0
PwiseA0<SEP>latency<SEP>GlobalBuffer 0
PwiseA0<SEP>latency<SEP>MainMemory 0
PwiseA0<SEP>energy<SEP>GlobalBuffer<SEP>read 0
PwiseA0<SEP>energy<SEP>GlobalBuffer<SEP>write 0
PwiseA0<SEP>energy<SEP>MainMemory<SEP>read 161440.0
PwiseA0<SEP>energy<SEP>MainMemory<SEP>write 0
PwiseA0<SEP>energy<SEP>MAC<SEP>compute 0
PwiseA0<SEP>energy<SEP>MainMemory<SEP>leak 0
PwiseA0<SEP>energy<SEP>GlobalBuffer<SEP>leak 0
PwiseA0<SEP>energy<SEP>MAC<SEP>leak 0
PwiseA0<SEP>n_iterations<SEP>0 1.0
PwiseA0<SEP>lower_iterations<SEP>0 0
PwiseA0<SEP>stride<SEP>P0<SEP>0 7
PwiseA0<SEP>initial<SEP>P0<SEP>0 7
PwiseA0<SEP>n_iterations<SEP>1 1.0
Pwise