In [1]:
import pandas as pd
import numpy as np
from loaders import *
from energy_helpers import get_energy
import torch
import torch.nn as nn

In [8]:
class HamidaEtAl(nn.Module):
    """3-D CNN classifier for hyperspectral image patches.

    Reference:
        3-D Deep Learning Approach for Remote Sensing Image Classification
        Amina Ben Hamida, Alexandre Benoit, Patrick Lambert, Chokri Ben Amar
        IEEE TGRS, 2018
        https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8344565

    The network consumes tensors shaped
    ``(batch, 1, input_channels, patch_size, patch_size)``: the spectral bands
    are laid out along the depth axis of the 3-D convolutions and the single
    "channel" axis has size 1.

    Args:
        input_channels: number of spectral bands (depth of the input volume).
        n_classes: number of output logits.
        patch_size: spatial side length of the input patch. A value of 3
            switches ``conv1`` to ``padding=1``; any other value uses no
            padding.
        dilation: dilation applied along the spectral (depth) axis of every
            convolution; the two spatial axes are never dilated.
    """

    @staticmethod
    def weight_init(m):
        """Kaiming-normal weights / zero bias for Linear and Conv3d modules.

        Intended for ``model.apply(HamidaEtAl.weight_init)``; modules of any
        other type are left untouched.
        """
        if isinstance(m, (nn.Linear, nn.Conv3d)):
            # Reach the initialisers through ``nn`` so this class does not
            # depend on a bare ``init`` name leaking in from a wildcard import.
            nn.init.kaiming_normal_(m.weight)
            # Layers built with bias=False expose ``bias`` as None.
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def __init__(self, input_channels, n_classes, patch_size=5, dilation=1):
        super().__init__()
        self.patch_size = patch_size
        self.input_channels = input_channels
        # PyTorch orders 3-D kernels as (depth, height, width); depth is the
        # spectral axis here, so only that axis receives the dilation.
        dilation = (dilation, 1, 1)

        # conv1: 3x3x3 kernel, stride 1, 20 output feature maps. 3x3 patches
        # are padded by 1 (on all three axes) so the spatial extent survives
        # this layer; larger patches are left unpadded.
        conv1_padding = 1 if patch_size == 3 else 0
        self.conv1 = nn.Conv3d(
            1, 20, (3, 3, 3), stride=(1, 1, 1), dilation=dilation,
            padding=conv1_padding)
        # pool1: "pooling" realised as a strided convolution that only looks
        # along the spectral axis (kernel (3, 1, 1), stride 2 on depth).
        self.pool1 = nn.Conv3d(
            20, 20, (3, 1, 1), dilation=dilation, stride=(2, 1, 1),
            padding=(1, 0, 0))
        # conv2 / pool2: same pattern as conv1 / pool1 with 75 feature maps
        # (the comment previously attached to this code said 35).
        self.conv2 = nn.Conv3d(
            20, 75, (3, 3, 3), dilation=dilation, stride=(1, 1, 1),
            padding=(1, 0, 0))
        self.pool2 = nn.Conv3d(
            75, 75, (3, 1, 1), dilation=dilation, stride=(2, 1, 1),
            padding=(1, 0, 0))
        # conv3 / conv4: two spectral-only convolutions, 75 feature maps
        # each, with kernels (3, 1, 1) and (2, 1, 1) and depth strides of
        # 1 and 2 respectively.
        self.conv3 = nn.Conv3d(
            75, 75, (3, 1, 1), dilation=dilation, stride=(1, 1, 1),
            padding=(1, 0, 0))
        self.conv4 = nn.Conv3d(
            75, 75, (2, 1, 1), dilation=dilation, stride=(2, 1, 1),
            padding=(1, 0, 0))

        # Size of the flattened conv4 output, measured with a dummy forward
        # pass so it follows input_channels / patch_size / dilation.
        self.features_size = self._get_final_flattened_size()
        # Final fully connected layer: one output per class.
        self.fc = nn.Linear(self.features_size, n_classes)

        # weight_init is not applied by the constructor; call
        # ``model.apply(HamidaEtAl.weight_init)`` to use it.

    def _get_final_flattened_size(self):
        """Return the per-sample element count of the conv4 output.

        Pushes a single all-zero sample through the convolutional stack
        (without the ReLUs, which do not change shapes) under ``no_grad``.
        """
        with torch.no_grad():
            x = torch.zeros((1, 1, self.input_channels,
                             self.patch_size, self.patch_size))
            x = self.pool1(self.conv1(x))
            x = self.pool2(self.conv2(x))
            x = self.conv3(x)
            x = self.conv4(x)
            # Conv3d output layout: (batch, channels, depth, height, width).
            _, channels, depth, height, width = x.size()
        return channels * depth * height * width

    def forward(self, x):
        """Map ``(batch, 1, input_channels, patch, patch)`` to class logits.

        Returns a tensor of shape ``(batch, n_classes)``.
        """
        # Use nn.functional explicitly instead of a bare ``F`` that is only
        # defined if some wildcard import happens to provide it.
        relu = nn.functional.relu

        x = relu(self.conv1(x))
        x = self.pool1(x)  # strided-conv pooling, no activation
        x = relu(self.conv2(x))
        x = self.pool2(x)
        x = relu(self.conv3(x))
        x = relu(self.conv4(x))
        # Flatten everything but the batch axis for the classifier.
        x = x.view(-1, self.features_size)
        x = self.fc(x)
        return x

In [9]:
# Hamida 3D
# Build the 3-D network for 220 spectral bands / 16 classes and feed it a
# random batch of 16 samples shaped (1, 220, 5, 5). The band count and patch
# size of `x` have to agree with the constructor arguments (patch_size
# defaults to 5).
model = HamidaEtAl(input_channels=220, n_classes=16)
x = torch.rand(16, 1, 220, 5, 5)

# get_energy comes from the project module energy_helpers. It returns a
# 9-tuple: the list of layer types, then a (total, per-layer list) pair for
# each of energy, MACs, parameters and cycles.
# NOTE(review): units of the energy figure are not visible here - confirm
# against energy_helpers before quoting the numbers.
types, energy, l_energy, mac, l_mac, param, l_param, cycle, l_cycle = get_energy(model, x, verbose=False)
print("layer types:", types)
print("energy:", energy, f"(layerwise: {l_energy})")
print("mac:", mac, f"(layerwise: {l_mac})")
print("param:", param, f"(layerwise: {l_param})")
print("cycle:", cycle, f"(layerwise: {l_cycle})")

HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_0.yaml _layers/map_0.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension R = 1
  Fac

[  0] Utilization = 1.00 | pJ/Compute =  281.245 | L5[WIO] F3 Q3 P218 - L4[] M2 T3 S3 R3 N16X - L3[] M10 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  0] Utilization = 1.00 | pJ/Compute =  270.760 | L5[WIO] F3 Q3 P218 - L4[I] M2 T3 S3 R3 N16X - L3[] M10 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  0] Utilization = 1.00 | pJ/Compute =   35.260 | L5[WIO] F3 Q3 P218 - L4[] M2 T3 S3 R3 N16X - L3[O] M10 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  0] Utilization = 1.00 | pJ/Compute =   24.774 | L5[WIO] F3 Q3 P218 - L4[I] M2 T3 S3 R3 N16X - L3[O] M10 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  3] Utilization = 1.00 | pJ/Compute =  149.866 | L5[WIO] F3 Q3 P218 - L4[] M20 T3 S3 R3 N16X - L3[] N1 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  3] Utilization = 1.00 | pJ/Compute =   25.390 | L5[WIO] F3 Q3 P218 - L4[I] M20 T3 S3 R3 N16X - L3[] N1 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  5] Utilization = 1.00 | pJ/Compute =  149.866 | L5[WIO] M2 F3 Q3 P218 - L4[] M10 T3 S3 R3 N16X - L3[] N1 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  5] Utilization =



Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   24.774
419.96 16951680 1059480
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_1.yaml _layers/map_1.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configu

[  0] Utilization = 1.00 | pJ/Compute =  283.873 | L5[WIO] C20 F3 Q3 P109 - L4[] M2 S3 N16X - L3[] M10 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  0] Utilization = 1.00 | pJ/Compute =  275.965 | L5[WIO] C20 F3 Q3 P109 - L4[W] M2 S3 N16X - L3[] M10 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  4] STATEMENT: 1500 invalid mappings (1492 fanout, 8 capacity) found since the last valid mapping, terminating search.
[  2] STATEMENT: 1500 invalid mappings (1492 fanout, 8 capacity) found since the last valid mapping, terminating search.
[  1] Utilization = 1.00 | pJ/Compute =  335.463 | L5[WIO] M2 C20 F3 Q3 P109 - L4[] M5 S3 N16X - L3[] M2 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  0] Utilization = 1.00 | pJ/Compute =  269.780 | L5[WIO] C20 F3 Q3 P109 - L4[WI] M2 S3 N16X - L3[] M10 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  1] Utilization = 1.00 | pJ/Compute =  327.555 | L5[WIO] M2 C20 F3 Q3 P109 - L4[W] M5 S3 N16X - L3[] M2 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  0] Utilization = 1.00 | pJ/Compute =  113.823 | L5[WIO] C20 F3



Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   27.561
519.12 18835200 1177200
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_2.yaml _layers/map_2.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configu

[  2] Utilization = 1.00 | pJ/Compute =  298.681 | L5[WIO] M3 C20 P109 - L4[] M5 T3 S3 R3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  275.110 | L5[WIO] M3 C20 P109 - L4[I] M5 T3 S3 R3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =   52.705 | L5[WIO] M3 C20 P109 - L4[] M5 T3 S3 R3 N16X - L3[O] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =   29.133 | L5[WIO] M3 C20 P109 - L4[I] M5 T3 S3 R3 N16X - L3[O] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  3] Utilization = 1.00 | pJ/Compute =  315.878 | L5[WIO] M5 C20 P109 - L4[] M5 T3 S3 R3 N16X - L3[] M3 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  3] Utilization = 1.00 | pJ/Compute =  308.036 | L5[WIO] M5 C20 P109 - L4[W] M5 T3 S3 R3 N16X - L3[] M3 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  3] Utilization = 1.00 | pJ/Compute =  276.592 | L5[WIO] M5 C20 P109 - L4[I] M5 T3 S3 R3 N16X - L3[] M3 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  3] Utilization = 1



Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   23.052
1628.2 70632000 4414500
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_3.yaml _layers/map_3.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configu

[  2] Utilization = 1.00 | pJ/Compute =  298.347 | L5[WIO] M3 C75 P55 - L4[] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  290.578 | L5[WIO] M3 C75 P55 - L4[W] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  278.200 | L5[WIO] M3 C75 P55 - L4[I] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  270.431 | L5[WIO] M3 C75 P55 - L4[WI] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  128.300 | L5[WIO] M3 C75 P55 - L4[] M5 S3 N16X - L3[O] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  120.532 | L5[WIO] M3 C75 P55 - L4[W] M5 S3 N16X - L3[O] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  7] Utilization = 1.00 | pJ/Compute =  315.543 | L5[WIO] M25 C75 P55 - L4[] S3 N16X - L3[] M3 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  7] Utilization = 1.00 | pJ/Compute =  307.775 | L5[WIO] M25 C75 P55 

[  2] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  4] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  7] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  5] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  6] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  3] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  1] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.




Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   23.381
347.21 14850000 928125
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_4.yaml _layers/map_4.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configur

[  2] Utilization = 1.00 | pJ/Compute =  298.347 | L5[WIO] M3 C75 P55 - L4[] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  290.578 | L5[WIO] M3 C75 P55 - L4[W] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  278.200 | L5[WIO] M3 C75 P55 - L4[I] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  270.431 | L5[WIO] M3 C75 P55 - L4[WI] M5 S3 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  5] Utilization = 1.00 | pJ/Compute =  277.711 | L5[WIO] M3 C75 P55 - L4[] S3 N16X - L3[] M25 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  5] Utilization = 1.00 | pJ/Compute =  269.942 | L5[WIO] M3 C75 P55 - L4[W] S3 N16X - L3[] M25 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  128.300 | L5[WIO] M3 C75 P55 - L4[] M5 S3 N16X - L3[O] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  120.532 | L5[WIO] M3 C75 P55 - L4

[  2] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  4] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  7] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  5] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  3] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  6] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  1] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.




Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   23.381
347.21 14850000 928125
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_5.yaml _layers/map_5.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configur

[  2] Utilization = 1.00 | pJ/Compute =  298.060 | L5[WIO] M3 C75 P28 - L4[] M5 S2 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  6] Utilization = 1.00 | pJ/Compute =  315.257 | L5[WIO] C75 P28 - L4[] M25 S2 N16X - L3[] M3 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  290.433 | L5[WIO] M3 C75 P28 - L4[W] M5 S2 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  277.913 | L5[WIO] M3 C75 P28 - L4[I] M5 S2 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  2] Utilization = 1.00 | pJ/Compute =  270.287 | L5[WIO] M3 C75 P28 - L4[WI] M5 S2 N16X - L3[] M5 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  5] Utilization = 1.00 | pJ/Compute =  277.424 | L5[WIO] M3 C75 P28 - L4[] S2 N16X - L3[] M25 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  5] Utilization = 1.00 | pJ/Compute =  269.797 | L5[WIO] M3 C75 P28 - L4[W] S2 N16X - L3[] M25 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  7] Utilization = 1.00 | pJ/Compute =  315.257 | L5[WIO] M25 C75 P28 - L4[]

[  6] Utilization = 1.00 | pJ/Compute =   26.467 | L5[WIO] M3 P28 C75 - L4[IO] M25 S2 N16X - L3[] N1 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  6] Utilization = 1.00 | pJ/Compute =   26.088 | L5[WIO] M3 P28 C75 - L4[O] S2 M25 N16X - L3[] N1 - L2[W] N1 - L1[I] N1 - L0[O] N1 
[  4] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  7] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  2] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  5] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  3] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  1] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.
[  6] STATEMENT: 800 suboptimal mappings found since the last upgrade, terminating search.




Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   23.663
119.26 5040000 315000
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_6.yaml _layers/map_6.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configura

[  0] STATEMENT: 1500 invalid mappings (1451 fanout, 49 capacity) found since the last valid mapping, terminating search.
[  2] Utilization = 1.00 | pJ/Compute =  155.342 | L5[WIO] O8 I84 - L4[] O1 N16X - L3[] O2 I25 - L2[W] O1 - L1[I] O1 - L0[O] O1 
[  2] Utilization = 1.00 | pJ/Compute =   91.742 | L5[WIO] O8 I84 - L4[] O1 N16X - L3[I] O2 I25 - L2[W] O1 - L1[I] O1 - L0[O] O1 
[  3] Utilization = 1.00 | pJ/Compute =  150.203 | L5[WIO] O2 I14 - L4[] O4 I3 N16X - L3[] O2 I50 - L2[W] O1 - L1[I] O1 - L0[O] O1 
[  2] Utilization = 1.00 | pJ/Compute =   81.645 | L5[WIO] O8 I84 - L4[] O1 N16X - L3[IO] O2 I25 - L2[W] O1 - L1[I] O1 - L0[O] O1 
[  3] Utilization = 1.00 | pJ/Compute =  146.802 | L5[WIO] O2 I14 - L4[] O4 I3 N16X - L3[O] O2 I50 - L2[W] O1 - L1[I] O1 - L0[O] O1 
[  3] Utilization = 1.00 | pJ/Compute =  145.264 | L5[WIO] O2 I14 - L4[O] O4 I3 N16X - L3[] O2 I50 - L2[W] O1 - L1[I] O1 - L0[O] O1 
[  3] Utilization = 1.00 | pJ/Compute =  145.237 | L5[WIO] O2 I14 - L4[O] O4 I3 N16X - L3[



Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   35.625
19.15 537600 33600
layer types: ['conv3d', 'conv3d', 'conv3d', 'conv3d', 'conv3d', 'conv3d', 'linear']
energy: 3400.11 (layerwise: [419.96, 519.12, 1628.2, 347.21, 347.21, 119.26, 19.15])
mac: 141696480 (layerwise: [16951680, 18835200, 70632000, 14850000, 14850000, 5040000, 537600])
param: 120840 (layerwise: [540, 1200, 40500, 16875, 16875, 11250, 33600])
cycle: 8856030 (layerwise: [1059480, 1177200, 4414500, 928125, 928125, 315000, 33600])


In [2]:
class HamidaEtAl2d(nn.Module):
    """2-D convolutional counterpart of the ``HamidaEtAl`` 3-D network.

    The layer sequence (conv1, pool1, conv2, pool2, conv3, conv4, fc) follows
    the 3-D model from

        3-D Deep Learning Approach for Remote Sensing Image Classification
        Amina Ben Hamida, Alexandre Benoit, Patrick Lambert, Chokri Ben Amar
        IEEE TGRS, 2018
        https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8344565

    but uses ``nn.Conv2d`` layers: the spectral bands are the input channels
    of ``conv1`` and the layers produce 20, 600, 500, 500, 800 and 1000
    feature maps respectively.

    Args:
        input_channels: number of spectral bands, i.e. channels of the input.
        n_classes: number of output logits.
        patch_size: spatial side length of the input patch. A value of 3
            switches ``conv1`` to ``padding=1``; any other value uses no
            padding.
        dilation: dilation passed unchanged to every convolution.
    """

    @staticmethod
    def weight_init(m):
        """Kaiming-normal weights / zero bias for Linear and Conv2d modules.

        Intended for ``model.apply(HamidaEtAl2d.weight_init)``; modules of any
        other type are left untouched.
        """
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            # Reach the initialisers through ``nn`` so this class does not
            # depend on a bare ``init`` name leaking in from a wildcard import.
            nn.init.kaiming_normal_(m.weight)
            # Layers built with bias=False expose ``bias`` as None.
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def __init__(self, input_channels, n_classes, patch_size=5, dilation=1):
        super().__init__()
        self.patch_size = patch_size
        self.input_channels = input_channels

        # conv1: 3x3 kernel, stride 1, 20 feature maps. Its input width is
        # taken from `input_channels` (it used to be fixed at 200, which made
        # every other band count fail in the constructor). 3x3 patches are
        # padded by 1; larger patches are left unpadded.
        conv1_padding = 1 if patch_size == 3 else 0
        self.conv1 = nn.Conv2d(
            input_channels, 20, (3, 3), stride=(1, 1), dilation=dilation,
            padding=conv1_padding)
        # pool1: "pooling" realised as a strided (3, 1) convolution that
        # halves the first spatial axis and widens to 600 feature maps.
        self.pool1 = nn.Conv2d(
            20, 600, (3, 1), dilation=dilation, stride=(2, 1), padding=(1, 0))
        # conv2 / pool2: same conv + strided-conv pattern, 500 feature maps.
        self.conv2 = nn.Conv2d(
            600, 500, (3, 3), dilation=dilation, stride=(1, 1), padding=(1, 0))
        self.pool2 = nn.Conv2d(
            500, 500, (3, 1), dilation=dilation, stride=(2, 1), padding=(1, 0))
        # conv3 / conv4: (3, 1) and (2, 1) kernels with strides 1 and 2 along
        # the first spatial axis, producing 800 and 1000 feature maps.
        self.conv3 = nn.Conv2d(
            500, 800, (3, 1), dilation=dilation, stride=(1, 1), padding=(1, 0))
        self.conv4 = nn.Conv2d(
            800, 1000, (2, 1), dilation=dilation, stride=(2, 1), padding=(1, 0))

        # Size of the flattened conv4 output, measured with a dummy forward
        # pass so it follows input_channels / patch_size / dilation.
        self.features_size = self._get_final_flattened_size()
        # Final fully connected layer: one output per class.
        self.fc = nn.Linear(self.features_size, n_classes)

        # weight_init is not applied by the constructor; call
        # ``model.apply(HamidaEtAl2d.weight_init)`` to use it.

    def _get_final_flattened_size(self):
        """Return the per-sample element count of the conv4 output.

        Pushes a single all-zero sample through the convolutional stack
        (without the ReLUs, which do not change shapes) under ``no_grad``.
        """
        with torch.no_grad():
            x = torch.zeros((1, self.input_channels,
                             self.patch_size, self.patch_size))
            x = self.pool1(self.conv1(x))
            x = self.pool2(self.conv2(x))
            x = self.conv3(x)
            x = self.conv4(x)
            # Conv2d output layout: (batch, channels, height, width).
            _, channels, height, width = x.size()
        return channels * height * width

    def forward(self, x):
        """Map a batch of patches to class logits.

        Accepts ``(batch, input_channels, patch, patch)`` as well as the 5-D
        layout ``(batch, 1, input_channels, patch, patch)`` used by the 3-D
        model. Returns a tensor of shape ``(batch, n_classes)``.
        """
        # Drop only the singleton axis of the 5-D layout. A bare squeeze()
        # would also remove a batch axis of size 1.
        if x.dim() == 5:
            x = x.squeeze(1)

        # Use nn.functional explicitly instead of a bare ``F`` that is only
        # defined if some wildcard import happens to provide it.
        relu = nn.functional.relu

        x = relu(self.conv1(x))
        x = self.pool1(x)  # strided-conv pooling, no activation
        x = relu(self.conv2(x))
        x = self.pool2(x)
        x = relu(self.conv3(x))
        x = relu(self.conv4(x))
        # Flatten everything but the batch axis for the classifier.
        x = x.view(-1, self.features_size)
        x = self.fc(x)
        return x

In [None]:
# Hamida 2D
# Build the 2-D network for 200 spectral bands / 16 classes and feed it a
# random batch of 16 samples shaped (200, 5, 5). The channel dimension of
# `x` has to equal `input_channels`.
model = HamidaEtAl2d(input_channels=200, n_classes=16)
x = torch.rand(16, 200, 5, 5)

# get_energy comes from the project module energy_helpers. Its 9-tuple is
# unpacked as the list of layer types followed by a (total, per-layer list)
# pair for each of energy, MACs, parameters and cycles.
# NOTE(review): units of the energy figure are not visible here - confirm
# against energy_helpers before quoting the numbers.
types, energy, l_energy, mac, l_mac, param, l_param, cycle, l_cycle = get_energy(model, x, verbose=False)
print("layer types:", types)
print("energy:", energy, f"(layerwise: {l_energy})")
print("mac:", mac, f"(layerwise: {l_mac})")
print("param:", param, f"(layerwise: {l_param})")
print("cycle:", cycle, f"(layerwise: {l_cycle})")

  self.weight = Parameter(torch.empty(


torch.Size([1, 1000, 1, 1])
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_0.yaml _layers/map_0.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along p

[  4] Utilization = 1.00 | pJ/Compute =  337.541 | L5[WIO] Q3 P3 R3 C200 - L4[] M10 N16X - L3[] S3 M2 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  4] Utilization = 1.00 | pJ/Compute =  280.087 | L5[WIO] Q3 P3 R3 C200 - L4[] M10 N16X - L3[I] S3 M2 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  4] Utilization = 1.00 | pJ/Compute =  167.495 | L5[WIO] Q3 P3 R3 C200 - L4[] M10 N16X - L3[O] S3 M2 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  4] Utilization = 1.00 | pJ/Compute =  110.042 | L5[WIO] Q3 P3 R3 C200 - L4[] M10 N16X - L3[IO] S3 M2 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  4] Utilization = 1.00 | pJ/Compute =   84.639 | L5[WIO] Q3 P3 R3 C200 - L4[O] M10 N16X - L3[] S3 M2 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  4] Utilization = 1.00 | pJ/Compute =   27.185 | L5[WIO] Q3 P3 R3 C200 - L4[O] M10 N16X - L3[I] S3 M2 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  3] Utilization = 1.00 | pJ/Compute =  396.045 | L5[WIO] R3 C200 - L4[] Q3 M20 N16X - L3[] S3 - L2[W] P3 - L1[I] Q1 - L0[O] Q1 
[  3] Utilization = 1.00 | pJ/Compute =  394.30



Summary stats for best mapping found by mapper:
  Utilization = 1.00 | pJ/Compute =   25.185
130.56 5184000 324000
HERE: designs/system_manual/arch/system_arch_1x16.yaml designs/system_manual/arch/components _layers/layer_shape_1.yaml _layers/map_1.yaml designs/system_auto/mapper/mapper.yaml
input file: tmp.yaml
  _______                __                
 /_  __(_)___ ___  ___  / /___  ____  ____ 
  / / / / __ `__ \/ _ \/ / __ \/ __ \/ __ \
 / / / / / / / / /  __/ / /_/ / /_/ / /_/ /
/_/ /_/_/ /_/ /_/\___/_/\____/\____/ .___/ 
                                  /_/      

Problem configuration complete.
execute:/usr/local/bin/accelergy tmp.yaml --oprefix timeloop-mapper. -o ./ > timeloop-mapper.accelergy.log 2>&1
Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configura

[  1] STATEMENT: 1500 invalid mappings (1478 fanout, 22 capacity) found since the last valid mapping, terminating search.
[  0] STATEMENT: 1500 invalid mappings (1474 fanout, 26 capacity) found since the last valid mapping, terminating search.
[  2] STATEMENT: 1500 invalid mappings (1474 fanout, 26 capacity) found since the last valid mapping, terminating search.
[  3] STATEMENT: 1500 invalid mappings (1473 fanout, 27 capacity) found since the last valid mapping, terminating search.
[  7] STATEMENT: 1500 invalid mappings (1472 fanout, 28 capacity) found since the last valid mapping, terminating search.
[  5] STATEMENT: 1500 invalid mappings (1473 fanout, 27 capacity) found since the last valid mapping, terminating search.
[  6] STATEMENT: 1500 invalid mappings (1473 fanout, 27 capacity) found since the last valid mapping, terminating search.
[  4] STATEMENT: 1500 invalid mappings (1473 fanout, 27 capacity) found since the last valid mapping, terminating search.


Generate Accelergy ERT (energy reference table) to replace internal energy model.
Generate Accelergy ART (area reference table) to replace internal area model.
Architecture configuration complete.
Sparse optimization configuration complete.
Using threads = 8
Mapper configuration complete.
Initializing Index Factorization subspace.
  Factorization options along problem dimension C = 1
  Factorization options along problem dimension M = 1
  Factorization options along problem dimension R = 7
  Factorization options along problem dimension S = 1
  Factorization options along problem dimension N = 1
  Factorization options along problem dimension P = 1
  Factorization options along problem dimension Q = 1
Mapspace Dimension [IndexFactorization] Size: 7
Mapspace Dimension [LoopPermutation] Size: 3252016064102400000
Mapspace Dimension [Spatial] Size: 8
Mapspace Dimension [DatatypeBypass] Size: 64
Mapspace split! Per-split Mapping Dimension [IndexFactorization] Size: 1 Residue: 1
Mapspace con

[  7] STATEMENT: search algorithm is done, terminating search.
[  3] Utilization = 1.00 | pJ/Compute =  147.258 | L5[WIO] Q3 P2 M600 C20 - L4[] Q1 N16X - L3[] R3 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  5] Utilization = 1.00 | pJ/Compute =  147.258 | L5[WIO] Q3 P2 M600 C20 - L4[] R3 N16X - L3[] Q1 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  4] STATEMENT: 1500 invalid mappings (1500 fanout, 0 capacity) found since the last valid mapping, terminating search.
[  6] Utilization = 1.00 | pJ/Compute =  155.824 | L5[WIO] Q3 P2 R3 M600 C20 - L4[] Q1 N16X - L3[] Q1 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  3] Utilization = 1.00 | pJ/Compute =  101.114 | L5[WIO] Q3 P2 C20 M600 - L4[] Q1 N16X - L3[I] R3 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  5] Utilization = 1.00 | pJ/Compute =  102.194 | L5[WIO] Q3 P2 C20 M600 - L4[I] R3 N16X - L3[] Q1 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  6] Utilization = 1.00 | pJ/Compute =  147.258 | L5[WIO] Q3 P2 M600 R3 C20 - L4[] Q1 N16X - L3[] Q1 - L2[W] Q1 - L1[I] Q1 - L0[O] Q1 
[  3] STAT