In [1]:
import pandas as pd
import numpy as np
from loaders import *

# TETRIS architecture and mapping: with and without accumulation at DRAM

In [2]:
# DRAM-reduction variant (the arch spec sets `reduction_supported: yes`).
# Echo both inputs first so the printed energy reference table (ERT) can be
# read against the exact configuration it was generated from.
show_config(ConfigRegistry.TETRIS_ARCH_DRAM_RED)
show_config(ConfigRegistry.TETRIS_COMPONENTS_DIR)
result = run_accelergy(
    ConfigRegistry.TETRIS_ARCH_DRAM_RED,
    ConfigRegistry.TETRIS_COMPONENTS_DIR,
)
print(result.ert)

architecture:
  # Architecture Description
  version: 0.3
  subtree:
    - name: system
      local:
        - name: 3D_vault_stack
          class: DRAM
          attributes:
            type: HBM2 # an assumption that Hybrid Memory Cube has same relative cost as HBM2, Cacti plug-in 
            width: 32 # each vault is 32 bit wide
            block-size: 2
            word-bits: 16
            reduction_supported: yes # flag for having digital accumulation logic processing near DRAM
      subtree:
        - name: eyeriss-modified
          attributes:
            technology: 45nm
          local:
            - name: shared_glb
              class: smartbuffer_SRAM
              attributes:
                memory_depth: 33250 # 133kB buffer with 32 bit lines
                memory_width: 32
                block-size: 2
                word-bits: 16
                read_bandwidth: 16
                write_bandwidth: 16
            - name: DummyBuffer[0..13] # for better mapping
     

In [3]:
# Variant without accumulation at DRAM (the arch spec sets
# `reduction_supported: no`); the components directory is the same one used
# for the DRAM-reduction run.
# NOTE: this rebinds `result`, so the ERT object from the DRAM-reduction run
# is no longer reachable under that name after this cell executes.
show_config(ConfigRegistry.TETRIS_ARCH_NO_DRAM_RED)
show_config(ConfigRegistry.TETRIS_COMPONENTS_DIR)
result = run_accelergy(
    ConfigRegistry.TETRIS_ARCH_NO_DRAM_RED,
    ConfigRegistry.TETRIS_COMPONENTS_DIR,
)
print(result.ert)

architecture:
  # Architecture Description
  version: 0.3
  subtree:
    - name: system
      local:
        - name: 3D_vault_stack
          class: DRAM
          attributes:
            type: HBM2 # an assumption that Hybrid Memory Cube has same relative cost as HBM2, Cacti plug-in 
            width: 32 # each vault is 32 bit wide
            block-size: 2
            word-bits: 16
            reduction_supported: no # flag for having digital accumulation logic processing near DRAM
      subtree:
        - name: eyeriss-modified
          attributes:
            technology: 45nm
          local:
            - name: shared_glb
              class: smartbuffer_SRAM
              attributes:
                memory_depth: 33250 # 133kB buffer with 32 bit lines
                memory_width: 32
                block-size: 2
                word-bits: 16
                read_bandwidth: 16
                write_bandwidth: 16
            - name: DummyBuffer[0..13] # for better mapping
      

### VGG02_layer5

In [8]:
# Run the Timeloop mapper for VGG02 layer 5 on the DRAM-reduction architecture.
# The call yields two values: the stats report and the selected mapping, each
# printed in its own cell afterwards.
(example_layer_stats, example_layer_mapping) = run_timeloop_mapper(
    ConfigRegistry.TETRIS_ARCH_DRAM_RED,
    ConfigRegistry.TETRIS_COMPONENTS_DIR,
    ConfigRegistry.TETRIS_ARCH_CONSTRAINTS,
    ConfigRegistry.TETRIS_MAP_CONSTRAINTS,
    ConfigRegistry.VGG02_layer5,
    ConfigRegistry.DEFAULT_MAPPER_SETTING,
)

input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 120
  Factorization options along problem dimension M = 495
  Factorization options along problem dimension R = 1
  Factorization options along problem dimension S = 1
  Factorization o

[  0] Utilization = 0.06 | pJ/Compute =   82.623 | L5[WIO] Q56 M4 - L4[] M16 P56 C128 M2X - L3[] Q1 M2Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] Q1 
[  0] Utilization = 0.06 | pJ/Compute =   72.253 | L5[WIO] Q56 M4 - L4[I] M16 P56 C128 M2X - L3[] Q1 M2Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] Q1 
[  4] Utilization = 0.49 | pJ/Compute =   19.028 | L5[WIO] Q28 M4 C128 - L4[] M2 P56 M4X Q2X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] M2 
[  3] Utilization = 0.12 | pJ/Compute =   44.192 | L5[WIO] Q7 M64 C8 - L4[] M2 P56 C8 Q8X - L3[] Q1 S3Y - L2[I] Q1 - L1[W] R3 C2 - L0[O] M2 
[  7] Utilization = 0.12 | pJ/Compute =   53.064 | L5[WIO] Q14 M8 C16 - L4[] M16 P56 C8 M2X Q4X - L3[] Q1 S3Y - L2[I] Q1 - L1[W] R3 - L0[O] Q1 
[  4] Utilization = 0.49 | pJ/Compute =   18.855 | L5[WIO] Q28 M4 C128 - L4[I] M2 P56 M4X Q2X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] M2 
[  7] Utilization = 0.12 | pJ/Compute =   42.935 | L5[WIO] Q14 M8 C16 - L4[W] M16 P56 C8 M2X Q4X - L3[] Q1 S3Y - L2[I] Q1 - L1[W] R3 - L0[O

[  6] Utilization = 0.86 | pJ/Compute =   19.712 | L5[WIO] Q8 M4 C32 - L4[] M4 P56 C4 M2X Q7X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] M2 
[  6] Utilization = 0.86 | pJ/Compute =   17.273 | L5[WIO] Q8 M4 C32 - L4[I] M4 P56 C4 M2X Q7X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] M2 
[  6] Utilization = 0.86 | pJ/Compute =   14.898 | L5[WIO] Q8 M4 C32 - L4[IO] M4 P56 C4 M2X Q7X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] M2 
[  2] Utilization = 0.86 | pJ/Compute =   15.887 | L5[WIO] Q4 C2 - L4[] M32 P56 C64 Q14X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] M2 
[  7] Utilization = 0.86 | pJ/Compute =   25.328 | L5[WIO] Q4 M4 C16 - L4[] M16 P56 C8 Q14X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] Q1 
[  7] Utilization = 0.86 | pJ/Compute =   22.434 | L5[WIO] Q4 M4 C16 - L4[W] M16 P56 C8 Q14X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W] R3 - L0[O] Q1 
[  7] Utilization = 0.86 | pJ/Compute =   21.298 | L5[WIO] Q4 M4 C16 - L4[WO] M16 P56 C8 Q14X - L3[] Q1 M4Y S3Y - L2[I] Q1 - L1[W



Summary stats for best mapping found by mapper:
  Utilization = 0.86 | pJ/Compute =    5.395


[  0] STATEMENT: 500 suboptimal mappings found since the last upgrade, terminating search.


In [9]:
# Show the mapping returned by run_timeloop_mapper: the loop nest grouped by
# storage level. print() is kept instead of a bare expression so the
# multi-line text renders verbatim.
# NOTE(review): assumes example_layer_mapping is plain text — confirm in loaders.
print(example_layer_mapping)


3D_vault_stack [ Weights:294912 (294912) Inputs:430592 (430592) Outputs:802816 (802816) ] 
-----------------------------------------------------------------------------------------
| for Q in [0:8)
|   for M in [0:2)
|     for C in [0:32)

shared_glb [ Inputs:2088 (2088) Outputs:50176 (50176) ] 
-------------------------------------------------------
|       for M in [0:16)
|         for P in [0:56)
|           for M in [0:2) (Spatial-X)
|             for Q in [0:7) (Spatial-X)

DummyBuffer [ ] 
---------------
|               for Q in [0:1)
|                 for M in [0:4) (Spatial-Y)
|                   for S in [0:3) (Spatial-Y)

ifmap_spad [ Inputs:12 (12) ] 
-----------------------------
|                     for Q in [0:1)

weights_spad [ Weights:12 (12) ] 
--------------------------------
|                       for R in [0:3)
|                         for C in [0:4)

psum_spad [ Outputs:1 (1) ] 
---------------------------
|                           for Q in [0:1)




In [10]:
# Show the per-level specs and stats report returned by run_timeloop_mapper
# for the same layer. print() is kept instead of a bare expression so the
# multi-line report renders verbatim.
# NOTE(review): assumes example_layer_stats is plain text — confirm in loaders.
print(example_layer_stats)

Buffer and Arithmetic Levels
----------------------------
Level 0
-------
=== mac ===

    SPECS
    -----
    Word bits             : 16
    Instances             : 196 (14*14)
    Compute energy        : 2.20 pJ

    STATS
    -----
    Utilized instances      : 168
    Computes (total)        : 924844032
    Cycles                  : 5505024
    Energy (total)          : 2034980565.81 pJ
    Area (total)            : 242942.00 um^2

Level 1
-------
=== psum_spad ===

    SPECS
    -----
        Technology                  : SRAM
        Size                        : 16
        Word bits                   : 16
        Block size                  : 1
        Cluster size                : 2
        Instances                   : 196 (14*14)
        Shared bandwidth            : -
        Read bandwidth              : 2.00
        Write bandwidth             : 2.00
        Multiple buffering          : 1.00
        Effective size              : 16
        Min utilization             : 0.