In [1]:
# Analysis of min-accesses required capacity

"""
TileFlow and LoopTree can only have one trunk, and all intermediate tensors
must be backed by the split.

Thus, two choices:

Choice 1:  Untiled fusion, weights can be tiled

DRAM
GLB -- Intermediates()
  Intra-Einsum mapping

  
Choice 2: Tiled fusion, weights cannot be tiled

DRAM
GLB -- Weights
for p of last layer
GLB -- Intermediates()
  Intra-Einsum mapping
"""

from fastfusion.frontend import Workload
from fastfusion.frontend.workload._isl import get_tensor_size

workload = Workload.from_yaml('workloads/mobilenet_7.workload.yaml')
einsums = workload.einsums

# In choice 1, long pole Einsum for capacity is Dwise1
choice_1_looptree_long_pole = sum(
    get_tensor_size(workload, tensor)
    for tensor in einsums['Dwise1'].tensor_names & workload.intermediate_tensor_names
)
print('Choice 1 LoopTree long pole:', choice_1_looptree_long_pole)

choice_1_tileflow_long_pole = sum(
    get_tensor_size(workload, tensor)
    for tensor in einsums['Dwise1'].tensor_names  # TileFlow has to be even
)
print('Choice 1 TileFlow long pole:', choice_1_tileflow_long_pole)


# In choice 2, long pole is keeping all weights
choice_2_long_pole = sum(
    get_tensor_size(workload, tensor)
    for tensor in ['WA0', 'WAB0', 'WB0', 'WA1', 'WAB1', 'WB1']
)

choice_2_long_pole += sum(
    get_tensor_size(workload, tensor)
    for tensor in ['T0', 'TA0']
) * 4/28

choice_2_long_pole += sum(
    get_tensor_size(workload, tensor)
    for tensor in ['TB0', 'T1', 'TA1']
) * 2/28

choice_2_long_pole += sum(
    get_tensor_size(workload, tensor)
    for tensor in ['TB1', 'T2']
) * 1/28 
print('Choice 2 long pole:', choice_2_long_pole)

INFO        Loading yaml file workloads/mobilenet_7.workload.yaml
INFO        Found top key workload in workloads/mobilenet_7.workload.yaml


Choice 1 LoopTree long pole: 392
Choice 1 TileFlow long pole: 428
Choice 2 long pole: 191.5


In [2]:
loopforest_long_pole = sum(
    get_tensor_size(workload, tensor)
    for tensor in ['WA0', 'WAB0', 'WB0']
)/192 + sum(
    get_tensor_size(workload, tensor)
    for tensor in ['T0', 'T1']
)

loopforest_long_pole

196.27083333333334

In [2]:
from snowcat_experiments import get_sims_with_cache

mappings_7_loopforest = get_sims_with_cache(arch_name='snowcat', workload_name='mobilenet_7')
# mappings_7_looptree = get_sims_with_cache(arch_name='snowcat', workload_name='mobilenet_7', tagger_name='one_split')
# mappings_7_tileflow = get_sims_with_cache(arch_name='snowcat_even', workload_name='mobilenet_7', tagger_name='one_split')

# mappings_28_loopforest = get_sims_with_cache(arch_name='snowcat', workload_name='mobilenet_28')
# mappings_28_looptree = get_sims_with_cache(arch_name='snowcat', workload_name='mobilenet_28', tagger_name='one_split')
# mappings_28_tileflow = get_sims_with_cache(arch_name='snowcat_even', workload_name='mobilenet_28', tagger_name='one_split')

# mappings_14_loopforest = get_sims_with_cache(arch_name='snowcat', workload_name='mobilenet_14')
# mappings_14_looptree = get_sims_with_cache(arch_name='snowcat', workload_name='mobilenet_14', tagger_name='one_split')
# mappings_14_tileflow = get_sims_with_cache(arch_name='snowcat_even', workload_name='mobilenet_14', tagger_name='one_split')

INFO        Loading yaml file architecture/snowcat.arch.yaml
INFO        Found top key variables in architecture/snowcat.arch.yaml
INFO        Found top key architecture in architecture/snowcat.arch.yaml
INFO        Loading yaml file workloads/mobilenet_7.workload.yaml
INFO        Found top key workload in workloads/mobilenet_7.workload.yaml


Loaded pmappings from results/sims/mobilenet_7.snowcat.None.pmappings.pkl


Compressing pmappings: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:09<00:00,  1.63s/it]


SIM PwiseA0 tensors: {'TA0'}
SIM Dwise0 tensors: {'TA0', 'TB0'}
SIM PwiseB0 tensors: {'TB0', 'T1'}
SIM PwiseA1 tensors: {'TA1', 'T1'}
SIM Dwise1 tensors: {'TB1', 'TA1'}
SIM PwiseB2 tensors: {'TB1'}


Grouping Partial Mappings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [00:08<00:00, 10.55it/s]
Grouping Partial Mappings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 203.42it/s]
Grouping Partial Mappings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 332.14it/s]
Grouping Partial Mappings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [00:01<00:00, 59.56it/s]
Grouping Partial Mappings: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

Initial consolidate and group: 11.37 seconds

Einsum Dwise0 (2/6)


Grouping Partial Mappings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 299.55it/s]
Merging mappings PwiseA0 <--> Dwise0:   0%|                                                                                                                                                | 0/35 [00:00<?, ?it/s]

Mapping merging: 0.16 seconds
	Combining 54(25) x 38(30) -> 35
	Number of groups for Einsum Dwise0: 35
	Number of mappings for Einsum Dwise0: 35
	Mappings per group for Einsum Dwise0: 1.0
	Largest left: 1
	Largest right: 1

Einsum PwiseB0 (3/6)



Grouping Partial Mappings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 365.30it/s][A

Merging mappings PwiseA0 <--> Dwise0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 334.47it/s][A


Mapping merging: 0.13 seconds
	Combining 54(25) x 62(34) -> 61
	Number of groups for Einsum PwiseB0: 61
	Number of mappings for Einsum PwiseB0: 61
	Mappings per group for Einsum PwiseB0: 1.0
	Largest left: 1
	Largest right: 1

Einsum PwiseA1 (4/6)


Grouping Partial Mappings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 210.51it/s]
Merging mappings Dwise0 <--> PwiseB0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61/61 [00:00<00:00, 501.95it/s]


Mapping merging: 0.06 seconds
	Combining 33(16) x 41(25) -> 10
	Number of groups for Einsum PwiseA1: 10
	Number of mappings for Einsum PwiseA1: 10
	Mappings per group for Einsum PwiseA1: 1.0
	Largest left: 1
	Largest right: 1

Einsum Dwise1 (5/6)



Grouping Partial Mappings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 204.13it/s][A

Merging mappings PwiseB0 <--> PwiseA1: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 218.19it/s][A


Mapping merging: 0.07 seconds
	Combining 2(3) x 38(30) -> 18
	Number of groups for Einsum Dwise1: 18
	Number of mappings for Einsum Dwise1: 18
	Mappings per group for Einsum Dwise1: 1.0
	Largest left: 1
	Largest right: 1

Einsum PwiseB2 (6/6)


Grouping Partial Mappings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 168.94it/s]
Merging mappings PwiseA1 <--> Dwise1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 243.27it/s]


Mapping merging: 0.05 seconds
	Combining 40(16) x 54(25) -> 16
	Number of groups for Einsum PwiseB2: 16
	Number of mappings for Einsum PwiseB2: 16
	Mappings per group for Einsum PwiseB2: 1.0
	Largest left: 1
	Largest right: 1



Final consolidate: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 603.49it/s][A

Grouping Partial Mappings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 192.58it/s][A
Merging mappings Dwise1 <--> PwiseB2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 207.76it/s]



Initial consolidate and group: 11.37 seconds
Mapping merging: 0.47 seconds

Total: 11.85 seconds

Merging with shared loops 2: Dwise1 <--> PwiseB2.
Merging with shared loops 1: Dwise0 <--> PwiseB0.
Merging with shared loops 1: PwiseA1 <--> Dwise1,PwiseB2.
Merging with shared loops 0: PwiseA0 <--> Dwise0,PwiseB0.
Merging with shared loops 0: PwiseA0,Dwise0,PwiseB0 <--> PwiseA1,Dwise1,PwiseB2.
Saved results to cache results/data/mobilenet_7.snowcat.None.pkl


In [9]:
from fastfusion.frontend import Workload
from fastfusion.visualization.interactive import plotly_show

workload = Workload.from_yaml('workloads/mobilenet_7.workload.yaml')
print(mappings_7_loopforest.data.columns)
plotly_show(mappings_7_loopforest.datamappings_7_loopforest.data, "RESOURCE_GlobalBuffer_LEVEL_0", "Total_Energy", logscales=True, einsum_names=workload.einsum_names)

INFO        Loading yaml file workloads/mobilenet_7.workload.yaml
INFO        Found top key workload in workloads/mobilenet_7.workload.yaml


Index(['PwiseB2___COMPRESSED_INDEX', 'Dwise1___COMPRESSED_INDEX',
       'PwiseA1___COMPRESSED_INDEX', 'PwiseB0___COMPRESSED_INDEX',
       'Dwise0___COMPRESSED_INDEX', 'PwiseA0___COMPRESSED_INDEX',
       'Total_Energy', 'PwiseA0___COMPRESSED_INDEX_x',
       'Dwise0___COMPRESSED_INDEX_x', 'PwiseB0___COMPRESSED_INDEX_x',
       'PwiseA1___COMPRESSED_INDEX_x', 'Dwise1___COMPRESSED_INDEX_x',
       'PwiseB2___COMPRESSED_INDEX_x', 'PwiseA0___tile_shape0',
       'PwiseA0___tile_shape1', 'PwiseA0___tile_shape2',
       'PwiseA0___tile_shape3', 'PwiseA0___tile_shape4',
       'PwiseA0___tile_shape5', 'PwiseA0___tile_shape6', 'PwiseA0___MAPPING',
       'PwiseA0___COMPRESSED_INDEX_y', 'PwiseA0___tile_shape7',
       'PwiseA0___tile_shape8', 'Dwise0___tile_shape0', 'Dwise0___tile_shape1',
       'Dwise0___tile_shape2', 'Dwise0___tile_shape3', 'Dwise0___tile_shape4',
       'Dwise0___tile_shape5', 'Dwise0___tile_shape6', 'Dwise0___tile_shape7',
       'Dwise0___MAPPING', 'Dwise0___COMPRESSED_

AttributeError: 'PmappingGroup' object has no attribute 'datamappings_7_loopforest'

In [None]:
plotly_show(mappings_28_looptree.data, "RESOURCE_GlobalBuffer_LEVEL_0", "Total_Energy", logscales=True, einsum_names=workload.einsum_names)

In [5]:
plotly_show(mappings_28_tileflow.data, "RESOURCE_GlobalBuffer_LEVEL_0", "Total_Energy", logscales=True, einsum_names=workload.einsum_names)

VBox(children=(FigureWidget({
    'data': [{'line': {'shape': 'hv'},
              'marker': {'symbol': 'circl…

In [22]:
from geomean import continuous_gm
from fastfusion.accelerated_imports import np

data = mappings_28_loopforest.data[["RESOURCE_GlobalBuffer_LEVEL_0", "Total_Energy"]].to_numpy().T
looptree_baseline = mappings_28_looptree.data[["RESOURCE_GlobalBuffer_LEVEL_0", "Total_Energy"]].to_numpy().T
looptree_baseline[0,-1] = 91488
print(continuous_gm(looptree_baseline, data))

NameError: name 'mappings_28_looptree' is not defined

In [26]:
data = mappings_28_loopforest.data[["RESOURCE_GlobalBuffer_LEVEL_0", "Total_Energy"]].to_numpy().T
sort_mask = np.argsort(data[0,:])
data = data[:,sort_mask]

tileflow_baseline = mappings_28_tileflow.data[["RESOURCE_GlobalBuffer_LEVEL_0", "Total_Energy"]].to_numpy().T
sort_mask = np.argsort(tileflow_baseline[0,:])
tileflow_baseline = tileflow_baseline[:,sort_mask]

# tileflow_baseline[0,-1] = 91488
print(continuous_gm(tileflow_baseline, data))

(np.float64(1.3453674382492102), np.float64(1.9780439121756488))
