## Step 3: Simulated Annealing for Interlayer Dependency

Run simulated annealing to perform a joint search over multiple layers.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import shutil
import subprocess
from pathlib import Path

import yaml

First, we define the architecture and the workload, as in the previous steps.

In [2]:
# Architecture description, written as a single literal.
configuration_dict = {
    # template design (with constraints and memory hierarchy representing "dataflow")
    'TEMPLATE_DESIGN': 'eyeriss_like',

    # number of bits used for I/O/W; we assume integer
    'WORDBITS': 16,

    # DRAM bandwidth setting: words / cycle (not bits / cycle)
    'DRAM_READ_BANDWIDTH': 32,
    'DRAM_WRITE_BANDWIDTH': 32,

    # SRAM setting
    # - do we have a single shared glb or multiple glbs for each datatype?
    # - for each glb (if shared, just one), define depth/width/#banks and bandwidths
    'SRAM_SHARED': True,
    'SRAM_DEPTH': [2 ** 13],
    'SRAM_WIDTH': [2 ** 7],
    # SRAM width and SRAM banks define the maximum possible bandwidth
    'SRAM_BANKS': [32],
    'SRAM_READ_BANDWIDTH': [32],
    'SRAM_WRITE_BANDWIDTH': [32],

    # PE array setting
    # - shape of PE array X x Y
    # - whether a PE has a shared scratchpad or separate scratchpads for each datatype
    'PE_X': 14,
    'PE_Y': 12,
    'PE_SPAD_SHARED': False,
    # Weight, IFmap, OFmap
    'PE_SPAD_DEPTH': [192, 12, 16],
    'PE_SPAD_WIDTH': [16, 16, 16],

    # Cryptographic engine setting
    # - type of cryptographic engine + dram (LPDDR4 + AES-GCM)
    # - cycle for AES-GCM
    # - whether the cryptographic engines are shared among all datatypes or assigned to each datatype
    'CRYPT_ENGINE_TYPE': 'effective_lpddr4_aesgcm',
    # avg. cycle/128bit
    'CRYPT_ENGINE_CYCLE_PER_BLOCK': 11,

    'CRYPT_ENGINE_SHARED': False,
    'CRYPT_ENGINE_COUNT': [1, 1, 1],

    'EFFECTIVE_CONSERVATIVE': True,
}

# Find the existing design version whose stored config.yaml equals
# configuration_dict. Nothing is created here: every sub-folder of the design
# (except 'template') is inspected, and the cell fails if none matches.
design_dir = 'designs/{}'.format(configuration_dict['TEMPLATE_DESIGN'])
arch_dir = None
total_vers = 0   # number of sub-folders whose config.yaml could be loaded
for path in os.listdir(design_dir):
    candidate_dir = os.path.join(design_dir, path)
    if path == 'template' or not os.path.isdir(candidate_dir):
        continue

    # Only expected I/O and parse failures are skipped; anything else
    # (including KeyboardInterrupt) propagates instead of being mislabelled.
    try:
        with open(os.path.join(candidate_dir, 'config.yaml'), 'r') as f:
            config_file = yaml.safe_load(f)
    except FileNotFoundError:
        print("No config.yaml file in the directory {}".format(candidate_dir))
        continue
    except (OSError, yaml.YAMLError) as err:
        print("Could not read config.yaml in the directory {}: {}".format(candidate_dir, err))
        continue

    total_vers += 1
    if config_file == configuration_dict:
        arch_dir = path
        print("Pre-existing folder found. Setting the arch_dir to {}".format(arch_dir))
        break

if arch_dir is None:
    raise NameError("Architecture is not found!")

No config.yaml file in the directory designs/eyeriss_like/.ipynb_checkpoints
Pre-existing folder found. Setting the arch_dir to ver1


Alternatively, if you already know which folder you want to use, specify it here instead of running the cell above.

In [None]:
# Manual selection: point at a known design version and load its stored configuration.
design_dir = 'designs/{}'.format('eyeriss_like')  # define your design name here
arch_ver = 0
arch_dir = 'ver{}'.format(arch_ver)               # sub directory under designs/{name}/{arch_dir}

arch_path = os.path.join(design_dir, arch_dir)
with open(os.path.join(arch_path, 'config.yaml'), 'r') as f:
    configuration_dict = yaml.safe_load(f)

print("Setting the architecture directory to: {}".format(arch_path))
print("Printing configuration:")
for setting_name, setting_value in configuration_dict.items():
    print("{}: {}".format(setting_name, setting_value))

Define the workload here. The pytorch2timeloop conversion is skipped, since it should already have been done when generating the loopnests.

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision.models as model_zoo

import pytorch2timeloop as pytorch2timeloop

# Note: this version only supports nn.Conv2d (both normal convs and depthwise/pointwise convs) and nn.Linear

# Input / Batch info
batch_size = 1
input_size = (3, 224, 224)

# Network under study: keep exactly one model_name / net pair active.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` - confirm against the installed version before changing it.

# AlexNet
# model_name = 'alexnet'
# net = model_zoo.alexnet(pretrained=False)

# ResNet18
# model_name = 'resnet18'
# net = model_zoo.resnet18(pretrained=False)

# MobilenetV2
model_name = 'mobilenet_v2'
net = model_zoo.mobilenet_v2(pretrained=False)

# Workload files are looked up under <top_dir>/<sub_dir>
top_dir = 'workloads'
sub_dir = '{}_batch{}'.format(model_name, batch_size)

print(net)

MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=



In [4]:
# Root of the project and the architecture folder (relative to it) chosen above
base_dir = Path(os.getcwd())
timeloop_dir = 'designs/{}/{}'.format(configuration_dict['TEMPLATE_DESIGN'], arch_dir)

# Walk the network once, numbering every Conv2d / Linear module from 1, and load
# the matching workload YAML. A layer whose workload equals that of an earlier
# layer is recorded as a duplicate of the earliest such layer.
n_layers = 0
layer_dict = {}             # layer number -> parsed workload YAML
layer_duplicate_info = {}   # layer number -> earliest layer with an equal workload
unique_layers = []          # layer numbers with no earlier equal workload
for module in net.modules():
    if not isinstance(module, (nn.Conv2d, nn.Linear)):
        continue

    n_layers += 1
    if n_layers not in layer_dict:
        workload_path = os.path.join(base_dir, top_dir, sub_dir,
                                     '{}_layer{}.yaml'.format(sub_dir, n_layers))
        with open(workload_path, 'r') as f:
            workload_info = yaml.safe_load(f)
        layer_dict[n_layers] = workload_info

    # identify the earliest duplicate layer
    earliest_match = next(
        (earlier for earlier in range(1, n_layers) if layer_dict[earlier] == layer_dict[n_layers]),
        None,
    )
    if earliest_match is None:
        unique_layers.append(n_layers)
    else:
        layer_duplicate_info[n_layers] = earliest_match

print(layer_duplicate_info)
print(unique_layers)

{10: 7, 16: 13, 17: 14, 18: 15, 19: 13, 25: 22, 26: 23, 27: 24, 28: 22, 29: 23, 30: 24, 31: 22, 32: 23, 37: 34, 38: 35, 39: 36, 40: 34, 46: 43, 47: 44, 48: 45, 49: 43, 50: 44}
[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 33, 34, 35, 36, 41, 42, 43, 44, 45, 51, 52, 53]


In [5]:
from pytorch_layer_dependency_utils import BackpropGraph

# Per-layer connectivity is cached in this file, next to the workload YAMLs.
workload_path = os.path.join(base_dir, top_dir, sub_dir, 'layer_info_interlayer.yaml')

# When True, the 'dependent_*' edges are left empty while rebuilding layer_info.
# This name was previously read without ever being assigned in this notebook,
# so the rebuild path failed with NameError on a fresh kernel.
ignore_interlayer = False

# Reuse the cached file only if it loads and has an entry for every layer.
layer_info = None
try:
    with open(workload_path, 'r') as f:
        cached_layer_info = yaml.safe_load(f)
except (OSError, yaml.YAMLError) as err:
    print("Cached layer info not usable ({}); rebuilding.".format(err))
else:
    if isinstance(cached_layer_info, dict) and \
            all(idx in cached_layer_info for idx in range(1, n_layers + 1)):
        layer_info = cached_layer_info
    else:
        print("Cached layer info does not cover all {} layers; rebuilding.".format(n_layers))

if layer_info is None:
    graph = BackpropGraph(net, [1, input_size[0], input_size[1], input_size[2]])
    consecutive_dict, dependent_dict = graph.get_dependency_info()

    # construct layer_info: duplicates point at the layer whose timeloop results they reuse
    layer_info = {}
    for layer_idx in range(1, n_layers + 1):
        layer_info[layer_idx] = {
            'layer_id_for_timeloop': layer_duplicate_info.get(layer_idx, layer_idx),
            'prev_layer': [],
            'next_layer': [],
            'dependent_prev_layer': [],
            'dependent_next_layer': [],
        }

    # fill in forward edges and mirror each one as a backward edge on its target
    for layer_idx in range(1, n_layers + 1):
        consecutive = consecutive_dict[layer_idx]
        dependent = dependent_dict[layer_idx]
        layer_info[layer_idx]['next_layer'].extend(consecutive)
        for i in consecutive:
            layer_info[i]['prev_layer'].append(layer_idx)
        if len(dependent) > 0 and not ignore_interlayer:
            layer_info[layer_idx]['dependent_next_layer'].extend(dependent)
            for i in dependent:
                layer_info[i]['dependent_prev_layer'].append(layer_idx)

    # store the results - this can take long for deep models like MobileNet..
    with open(workload_path, 'w') as f:
        yaml.dump(layer_info, f)

for layer_idx in range(1, n_layers + 1):
    print(layer_idx, layer_info[layer_idx])


1 {'dependent_next_layer': [2], 'dependent_prev_layer': [], 'layer_id_for_timeloop': 1, 'next_layer': [2], 'prev_layer': []}
2 {'dependent_next_layer': [3], 'dependent_prev_layer': [1], 'layer_id_for_timeloop': 2, 'next_layer': [3], 'prev_layer': [1]}
3 {'dependent_next_layer': [4], 'dependent_prev_layer': [2], 'layer_id_for_timeloop': 3, 'next_layer': [4], 'prev_layer': [2]}
4 {'dependent_next_layer': [5], 'dependent_prev_layer': [3], 'layer_id_for_timeloop': 4, 'next_layer': [5], 'prev_layer': [3]}
5 {'dependent_next_layer': [6], 'dependent_prev_layer': [4], 'layer_id_for_timeloop': 5, 'next_layer': [6], 'prev_layer': [4]}
6 {'dependent_next_layer': [7], 'dependent_prev_layer': [5], 'layer_id_for_timeloop': 6, 'next_layer': [7, 10], 'prev_layer': [5]}
7 {'dependent_next_layer': [8], 'dependent_prev_layer': [6], 'layer_id_for_timeloop': 7, 'next_layer': [8], 'prev_layer': [6]}
8 {'dependent_next_layer': [9], 'dependent_prev_layer': [7], 'layer_id_for_timeloop': 8, 'next_layer': [9], '

### Prepare the folders

In [6]:
# Create <timeloop_dir>/joint_topk/<sub_dir>, including joint_topk itself, in one
# call. exist_ok=True keeps the cell re-runnable without a separate exists() check.
os.makedirs(os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir), exist_ok=True)

In [7]:
# Define top-k you used for timeloop-topk
# The evaluation cell further down runs mapping1.yaml .. mapping{topk}.yaml for
# every layer, so this has to match the number of mapping files available.
# NOTE(review): the recorded output contains recurring "Failed to run Accelergy"
# messages - confirm that a mapping{topk}.yaml exists for every layer.
topk = 6

In [8]:
# Baseline: run AuthBlockAssignment in "search" mode with joint=False and total
# up the costs it returns.
from authblock_assignment import AuthBlockAssignment

base_cost_dict, base_rehash_cost_dict, base_block_info_dict = AuthBlockAssignment(
    n_layers, layer_info,
    base_dir, timeloop_dir, top_dir, sub_dir,
    configuration_dict,
    mode="search",
    joint=False,
    return_cost_dict=True,
)

baseline_energy = 0
baseline_latency = 0
baseline_add_mem_traffic = 0

# Regular costs first, then rehash costs, accumulated into the same totals.
for cost_table in (base_cost_dict, base_rehash_cost_dict):
    for key in cost_table:
        entry = cost_table[key]
        baseline_energy += entry['total_energy'] / 10**6
        baseline_latency += entry['total_latency']
        baseline_add_mem_traffic += entry['add_memory_traffic']
    
# Evaluate every layer's top-k mappings with timeloop-model. Each run executes in
# joint_topk/<sub_dir>/layer<idx>/eval<k>, so its output files land there.
arch_root = base_dir / timeloop_dir

# The component files are expanded here rather than by a shell, so the command
# can be passed as an argument list and paths need no quoting.
component_files = sorted(str(p) for p in (arch_root / 'arch' / 'components').glob('*.yaml'))

for layer_idx in range(1, n_layers + 1):
    work_dir = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'layer{}'.format(layer_idx))
    os.makedirs(work_dir, exist_ok=True)

    # Duplicate layers reuse the mappings of the earliest identical layer.
    layer_id_for_timeloop = layer_info[layer_idx]['layer_id_for_timeloop']

    # Constraint set for THIS layer's workload (previously a stale workload_info
    # left over from an earlier cell was inspected for every layer).
    # NOTE(review): constraint_pth is not passed to timeloop-model below; it is
    # kept in case a later cell reads it - confirm and drop if unused.
    if 'M' in layer_dict[layer_idx]['problem']['instance']:
        constraint_pth = base_dir/timeloop_dir/'constraints/*.yaml'
    else:
        # depthwise
        constraint_pth = base_dir/timeloop_dir/'constraints_dw/*.yaml'

    for k in range(1, topk + 1):
        cwd = os.path.join(work_dir, 'eval{}'.format(k))
        os.makedirs(cwd, exist_ok=True)

        timeloopcmd = [
            'timeloop-model',
            str(arch_root / 'arch' / 'baseline.yaml'),
            *component_files,
            str(arch_root / 'scheduling' / sub_dir / f'layer{layer_id_for_timeloop}' / f'mapping{k}.yaml'),
            f"{base_dir/top_dir/sub_dir/sub_dir}_layer{layer_idx}.yaml",
        ]

        # cwd= replaces the os.chdir() round trip, so the kernel's working
        # directory is never left pointing at an eval folder.
        result = subprocess.run(timeloopcmd, cwd=cwd)

        # Surface failed runs instead of silently continuing. The stats file is
        # checked as well because the exit status alone may not reflect an
        # Accelergy failure (TODO confirm timeloop-model's exit code in that case).
        stats_file = os.path.join(cwd, 'timeloop-model.map+stats.xml')
        if result.returncode != 0 or not os.path.exists(stats_file):
            print("WARNING: timeloop-model did not produce results for layer {} / mapping {} "
                  "(exit code {})".format(layer_idx, k, result.returncode))

    # copy mapping1's result into here
    shutil.copy(os.path.join(work_dir, 'eval1', 'timeloop-model.map+stats.xml'), work_dir)


execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer1/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer1.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.21 | pJ/Compute =   13.194
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer1/mapping2.yaml /home/workspace/sche

Utilization = 0.10 | pJ/Compute =   20.174
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer3/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer3.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.33 | pJ/Compute =   20.189
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch

Utilization = 0.28 | pJ/Compute =  104.715
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer5/mapping5.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer5.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.21 | pJ/Compute =  113.438
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch

Failed to run Accelergy. Did you install Accelergy or specify ACCELERGYPATH correctly? Or check accelergy.log to see what went wrong
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer8/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer8.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.25 | pJ/Compute =   49.312
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SR

Utilization = 0.75 | pJ/Compute =   13.428
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer7/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer10.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.86 | pJ/Compute =   13.389
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batc

Utilization = 0.33 | pJ/Compute =   16.091
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer12/mapping5.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer12.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.50 | pJ/Compute =   16.110
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Failed to run Accelergy. Did you install Accelergy or specify ACCELERGYPATH correctly? Or check accelergy.log to see what went wrong
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer15/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer15.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.17 | pJ/Compute =   15.849
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_

Utilization = 0.50 | pJ/Compute =   50.402
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer14/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer17.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.53 | pJ/Compute =   50.402
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Utilization = 0.67 | pJ/Compute =   12.096
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer13/mapping5.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer19.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.67 | pJ/Compute =   12.497
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Failed to run Accelergy. Did you install Accelergy or specify ACCELERGYPATH correctly? Or check accelergy.log to see what went wrong
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer22/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer22.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.67 | pJ/Compute =    9.486
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_

Utilization = 0.50 | pJ/Compute =   10.781
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer24/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer24.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.33 | pJ/Compute =   10.771
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Utilization = 0.50 | pJ/Compute =   52.581
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer23/mapping5.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer26.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.32 | pJ/Compute =   61.305
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Failed to run Accelergy. Did you install Accelergy or specify ACCELERGYPATH correctly? Or check accelergy.log to see what went wrong
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer23/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer29.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.50 | pJ/Compute =   52.581
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_

Utilization = 0.67 | pJ/Compute =    9.545
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer22/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer31.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.67 | pJ/Compute =    9.887
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Utilization = 0.71 | pJ/Compute =   12.830
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer33/mapping5.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer33.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.33 | pJ/Compute =   13.252
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Failed to run Accelergy. Did you install Accelergy or specify ACCELERGYPATH correctly? Or check accelergy.log to see what went wrong
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer36/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer36.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.75 | pJ/Compute =   10.584
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_

Utilization = 0.50 | pJ/Compute =   52.581
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer35/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer38.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.25 | pJ/Compute =   52.581
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Utilization = 1.00 | pJ/Compute =   12.088
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer34/mapping5.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer40.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.57 | pJ/Compute =   13.701
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Failed to run Accelergy. Did you install Accelergy or specify ACCELERGYPATH correctly? Or check accelergy.log to see what went wrong
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer43/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer43.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.33 | pJ/Compute =   10.518
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_

Utilization = 1.00 | pJ/Compute =   10.249
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer45/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer45.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.83 | pJ/Compute =   10.415
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Utilization = 0.38 | pJ/Compute =   61.256
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer44/mapping5.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer47.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.25 | pJ/Compute =   69.979
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

Failed to run Accelergy. Did you install Accelergy or specify ACCELERGYPATH correctly? Or check accelergy.log to see what went wrong
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer44/mapping1.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer50.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.44 | pJ/Compute =   61.256
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_

Utilization = 0.33 | pJ/Compute =   10.814
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_batch1/layer52/mapping3.yaml /home/workspace/scheduling/workloads/mobilenet_v2_batch1/mobilenet_v2_batch1_layer52.yaml --oprefix timeloop-model. -o ./ > timeloop-model.accelergy.log 2>&1
Utilization = 0.33 | pJ/Compute =   10.954
execute:/usr/local/bin/accelergy /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/baseline.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_RF.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/arch/components/smartbuffer_SRAM.yaml /home/workspace/scheduling/designs/eyeriss_like/ver1/scheduling/mobilenet_v2_bat

### Run simulated annealing

Define the hyperparameters here:

In [9]:
# Annealing budget: the temperature is lowered from `initial_temp` to
# `final_temp` over `n_iters` iterations.
n_iters = 1000
initial_temp, final_temp = 100, 0.1

# Name of the cooling schedule; the annealing cell below handles
# 'linear', 'cosine' and 'quadratic'.
cooling_scheduler = 'linear'

# 1-based layer indices to leave out of the simulated annealing search
# (e.g., non-conv layers in AlexNet).
layers_exclude_from_search = list()

# TODO: this option should not be used for ResNet18 - bug with dependent layer
# partial update due to residuals
use_partial_update = True

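Run the simulated annealing search over the per-layer top-k loopnest candidates, logging every iteration to a CSV file and saving the best and final states to a YAML file.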

In [10]:
import ast
import copy
import csv
import math
import random
import time

from authblock_assignment import PartialUpdateAuthBlockAssignment


def _read_timeloop_stats(stats_path):
    """Parse the energy and cycle count from a ``timeloop-model.stats.txt`` file.

    Only the last 200 lines of the file are scanned for lines starting with
    ``Energy`` and ``Cycles``; if several match, the last one wins.

    Returns:
        (energy, cycle): ``energy`` is the parsed number multiplied by 1e6
        (micro to pico), ``cycle`` is the parsed number as written in the file.

    Raises:
        ValueError: if either value is missing, so that a stale value from a
        previous iteration is never silently reused.
    """
    energy, cycle = None, None
    with open(stats_path, 'r') as f:
        tail = f.read().split('\n')[-200:]
    for line in tail:
        if line.startswith('Energy'):
            # literal_eval only accepts literals, unlike eval() which would
            # execute arbitrary expressions found in the file.
            energy = ast.literal_eval(line.split(': ')[1].split(' ')[0].strip()) * float(10**6)  # micro to pico
        elif line.startswith('Cycles'):
            cycle = ast.literal_eval(line.split(': ')[1].strip())
    if energy is None or cycle is None:
        raise ValueError("Could not find 'Energy'/'Cycles' in {}".format(stats_path))
    return energy, cycle


def _summarize_costs(cost_dict, rehash_cost_dict):
    """Sum energy, latency and additional memory traffic over both cost dicts.

    Each entry's ``total_energy`` is divided by 1e6 before summing (the CSV
    header reports this total as uJ). Entries of ``cost_dict`` are accumulated
    first, then those of ``rehash_cost_dict``.

    Returns:
        (energy, latency, add_mem_traffic)
    """
    energy, latency, add_mem_traffic = 0, 0, 0
    for table in (cost_dict, rehash_cost_dict):
        for key in table:
            energy += table[key]['total_energy'] / 10**6
            latency += table[key]['total_latency']
            add_mem_traffic += table[key]['add_memory_traffic']
    return energy, latency, add_mem_traffic


# NOTE(review): the cost logged below is energy (uJ) x latency (cycles), so the
# "J x cycles" label looks off by the uJ scaling; the header text is kept
# unchanged in case downstream scripts select columns by name.
csv_header = ['Iter', 'Temp',
              'Cost (J x cycles)', 'Total Latency (cycles)', 'Total Energy (uJ)', 'Additional Off-chip Traffic (bits)']
logs = []

# `solution_*` holds the currently accepted state, `current_*` the candidate
# being evaluated in each iteration. Both start from the baseline.
solution_cost_dict = copy.deepcopy(base_cost_dict)
solution_rehash_cost_dict = copy.deepcopy(base_rehash_cost_dict)
solution_block_info_dict = copy.deepcopy(base_block_info_dict)

current_cost_dict = copy.deepcopy(base_cost_dict)
current_rehash_cost_dict = copy.deepcopy(base_rehash_cost_dict)
current_block_info_dict = copy.deepcopy(base_block_info_dict)

# state[k] is the loopnest (eval) id chosen for layer k + 1
solution_state = [1] * n_layers
current_state = [1] * n_layers
best_state = [1] * n_layers

cost_best = baseline_energy * baseline_latency

# Only layers that have a dependent previous/next layer take part in the search.
layers_for_search = []
for idx in range(1, n_layers + 1):
    has_dependency = len(layer_info[idx]['dependent_next_layer']) > 0 or \
                     len(layer_info[idx]['dependent_prev_layer']) > 0
    if has_dependency and idx not in layers_exclude_from_search:
        layers_for_search.append(idx)
if not layers_for_search:
    raise ValueError("No layers left to search: every layer is excluded or has no dependent layer")

joint_topk_dir = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir)

start_time = time.time()
for i in range(n_iters + 1):
    # temperature
    if cooling_scheduler == 'linear':
        current_temp = final_temp + (initial_temp - final_temp) / float(n_iters) * float(n_iters - i)
    elif cooling_scheduler == 'cosine':
        current_temp = final_temp + 0.5 * (initial_temp - final_temp) * (1 + math.cos(float(i) * math.pi / float(n_iters)))
    elif cooling_scheduler == 'quadratic':
        current_temp = final_temp + (initial_temp - final_temp) * (float(n_iters - i) / float(n_iters))**2
    else:
        raise ValueError("Unknown cooling_scheduler: {!r} (expected 'linear', 'cosine' or 'quadratic')"
                         .format(cooling_scheduler))

    # Neighbor move: pick one searchable layer and give it a random top-k loopnest.
    layer2change = random.choice(layers_for_search)
    neighbor_loopnest = random.choice(list(range(1, topk + 1)))

    current_state[layer2change - 1] = neighbor_loopnest
    layer_dir = os.path.join(joint_topk_dir, "layer{}".format(layer2change))
    eval_dir = os.path.join(layer_dir, "eval{}".format(neighbor_loopnest))

    energy, cycle = _read_timeloop_stats(os.path.join(eval_dir, "timeloop-model.stats.txt"))
    current_cost_dict[layer2change]['timeloop_energy'] = energy
    current_cost_dict[layer2change]['timeloop_cycle'] = cycle

    # Place the candidate's mapping xml in the layer directory before re-running
    # the authentication-block assignment.
    shutil.copy(os.path.join(eval_dir, "timeloop-model.map+stats.xml"), layer_dir)

    if use_partial_update:
        # Re-evaluate only the changed layer and its direct neighbors.
        subset_layers = [layer2change]
        subset_layers.extend(layer_info[layer2change]['prev_layer'])
        subset_layers.extend(layer_info[layer2change]['next_layer'])

        current_cost_dict, current_rehash_cost_dict, current_block_info_dict = \
            PartialUpdateAuthBlockAssignment(n_layers, layer_info,
                                             base_dir, timeloop_dir, top_dir, sub_dir,
                                             configuration_dict, mode="search",
                                             prev_block_info_dict=current_block_info_dict, subset_layers=subset_layers,
                                             prev_cost_dict=current_cost_dict, prev_rehash_cost_dict=current_rehash_cost_dict)
    else:
        current_cost_dict, current_rehash_cost_dict, current_block_info_dict = \
            PartialUpdateAuthBlockAssignment(n_layers, layer_info,
                                             base_dir, timeloop_dir, top_dir, sub_dir,
                                             configuration_dict,
                                             mode="search",
                                             prev_block_info_dict=None, subset_layers=[],
                                             prev_cost_dict=current_cost_dict, prev_rehash_cost_dict=None)

    solution_energy, solution_latency, solution_add_mem_traffic = \
        _summarize_costs(solution_cost_dict, solution_rehash_cost_dict)
    current_energy, current_latency, current_add_mem_traffic = \
        _summarize_costs(current_cost_dict, current_rehash_cost_dict)

    # Cost is the energy x latency product; a positive diff means the candidate is better.
    cost_solution = solution_energy * solution_latency
    cost_current = current_energy * current_latency
    cost_diff = (cost_solution - cost_current) / (10 ** 6 * n_layers)

    if cost_current < cost_best:
        best_state = copy.deepcopy(current_state)
        cost_best = cost_current
        print("Found best so far: ", best_state, " .. updating cost_best: {}".format(cost_best))

    # Always accept improvements; accept worse candidates with probability exp(cost_diff / T).
    if cost_diff > 0 or (random.uniform(0, 1) < math.exp(cost_diff / current_temp)):
        solution_state = copy.deepcopy(current_state)
        solution_cost_dict = copy.deepcopy(current_cost_dict)
        solution_rehash_cost_dict = copy.deepcopy(current_rehash_cost_dict)
        solution_block_info_dict = copy.deepcopy(current_block_info_dict)
    else:
        # roll-back to the solution state (including the mapping xml on disk)
        xml_file = os.path.join(layer_dir, "eval{}".format(solution_state[layer2change - 1]),
                                "timeloop-model.map+stats.xml")
        shutil.copy(xml_file, layer_dir)
        current_state = copy.deepcopy(solution_state)
        current_cost_dict = copy.deepcopy(solution_cost_dict)
        current_rehash_cost_dict = copy.deepcopy(solution_rehash_cost_dict)
        current_block_info_dict = copy.deepcopy(solution_block_info_dict)

    # Recompute the totals AND the cost of the accepted solution after the
    # accept/reject step, so every column of the log row describes the same state.
    solution_energy, solution_latency, solution_add_mem_traffic = \
        _summarize_costs(solution_cost_dict, solution_rehash_cost_dict)
    cost_solution = solution_energy * solution_latency

    print("Current iteration: {} (temperature: {:.2f}) -- Latency: {} ({:.2f}% faster), Energy: {} uJ ({:.2f}% lower), Add Mem Traffic: {} bits ({:.2f}% smaller)"
          .format(i + 1, current_temp, solution_latency, (baseline_latency - solution_latency) / float(baseline_latency) * 100.,
                  solution_energy, (baseline_energy - solution_energy) / baseline_energy * 100.,
                  solution_add_mem_traffic, (baseline_add_mem_traffic - solution_add_mem_traffic) / float(baseline_add_mem_traffic) * 100.))

    logs.append([(i + 1), current_temp, cost_solution, solution_latency, solution_energy, solution_add_mem_traffic])

    if current_temp < final_temp:
        break

print("Execution time: {}s".format(time.time() - start_time))

# dump to csv file
with open(os.path.join(joint_topk_dir, 'SA_{}_top{}_summary.csv'.format(cooling_scheduler, topk)), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(csv_header)
    writer.writerows(logs)

# dump best state & solution state to yaml file
state = {'best': best_state, 'final': solution_state}
with open(os.path.join(joint_topk_dir, 'SA_{}_state.yaml'.format(cooling_scheduler)), 'w') as f:
    yaml.dump(state, f)
        

Current iteration: 1 (temperature: 100.00) -- Latency: 20948946.0 (0.00% faster), Energy: 5588.06985392 uW (0.00% lower), Add Mem Traffic: 16643584.0 bits (0.00% smaller)
Found best so far:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  .. updating cost_best: 116957578000.43716
Current iteration: 2 (temperature: 99.90) -- Latency: 20929974.0 (0.09% faster), Energy: 5588.042202080001 uW (0.00% lower), Add Mem Traffic: 16670720.0 bits (-0.16% smaller)
Current iteration: 3 (temperature: 99.80) -- Latency: 20929974.0 (0.09% faster), Energy: 5603.638301280001 uW (-0.28% lower), Add Mem Traffic: 16691200.0 bits (-0.29% smaller)
Current iteration: 4 (temperature: 99.70) -- Latency: 20929974.0 (0.09% faster), Energy: 5603.58830128 uW (-0.28% lower), Add Mem Traffic: 16691200.0 bits (-0.29% smaller)
Current iteration: 5 (temperature: 99.60) -- Latency: 20931108.0 (0.09% faster), En

Current iteration: 47 (temperature: 95.40) -- Latency: 20747670.0 (0.96% faster), Energy: 5760.58247984 uW (-3.09% lower), Add Mem Traffic: 16461824.0 bits (1.09% smaller)
Current iteration: 48 (temperature: 95.30) -- Latency: 20785404.0 (0.78% faster), Energy: 5767.444669919999 uW (-3.21% lower), Add Mem Traffic: 16469120.0 bits (1.05% smaller)
Current iteration: 49 (temperature: 95.20) -- Latency: 20787726.0 (0.77% faster), Energy: 5770.7601195199995 uW (-3.27% lower), Add Mem Traffic: 16484032.0 bits (0.96% smaller)
Current iteration: 50 (temperature: 95.10) -- Latency: 20787726.0 (0.77% faster), Energy: 5770.7601195199995 uW (-3.27% lower), Add Mem Traffic: 16484032.0 bits (0.96% smaller)
Current iteration: 51 (temperature: 95.00) -- Latency: 20823108.0 (0.60% faster), Energy: 5772.748044639999 uW (-3.30% lower), Add Mem Traffic: 16488512.0 bits (0.93% smaller)
Current iteration: 52 (temperature: 94.91) -- Latency: 20899140.0 (0.24% faster), Energy: 5774.951083359999 uW (-3.34% low

Current iteration: 94 (temperature: 90.71) -- Latency: 20783754.0 (0.79% faster), Energy: 5731.3632395199975 uW (-2.56% lower), Add Mem Traffic: 14704832.0 bits (11.65% smaller)
Current iteration: 95 (temperature: 90.61) -- Latency: 20783754.0 (0.79% faster), Energy: 5743.628026719998 uW (-2.78% lower), Add Mem Traffic: 14712512.0 bits (11.60% smaller)
Current iteration: 96 (temperature: 90.51) -- Latency: 20783754.0 (0.79% faster), Energy: 5743.598026719998 uW (-2.78% lower), Add Mem Traffic: 14712512.0 bits (11.60% smaller)
Current iteration: 97 (temperature: 90.41) -- Latency: 20783754.0 (0.79% faster), Energy: 5743.598026719998 uW (-2.78% lower), Add Mem Traffic: 14712512.0 bits (11.60% smaller)
Current iteration: 98 (temperature: 90.31) -- Latency: 20783754.0 (0.79% faster), Energy: 5743.598026719998 uW (-2.78% lower), Add Mem Traffic: 14712512.0 bits (11.60% smaller)
Current iteration: 99 (temperature: 90.21) -- Latency: 20783754.0 (0.79% faster), Energy: 5743.528026719999 uW (-2

Current iteration: 141 (temperature: 86.01) -- Latency: 20394066.0 (2.65% faster), Energy: 5761.0567576 uW (-3.10% lower), Add Mem Traffic: 14124736.0 bits (15.13% smaller)
Current iteration: 142 (temperature: 85.91) -- Latency: 20418450.0 (2.53% faster), Energy: 5731.125872480001 uW (-2.56% lower), Add Mem Traffic: 14267072.0 bits (14.28% smaller)
Current iteration: 143 (temperature: 85.81) -- Latency: 20423250.0 (2.51% faster), Energy: 5731.19533872 uW (-2.56% lower), Add Mem Traffic: 14283328.0 bits (14.18% smaller)
Current iteration: 144 (temperature: 85.71) -- Latency: 20423250.0 (2.51% faster), Energy: 5730.98533872 uW (-2.56% lower), Add Mem Traffic: 14283328.0 bits (14.18% smaller)
Current iteration: 145 (temperature: 85.61) -- Latency: 20423250.0 (2.51% faster), Energy: 5730.98533872 uW (-2.56% lower), Add Mem Traffic: 14283328.0 bits (14.18% smaller)
Current iteration: 146 (temperature: 85.51) -- Latency: 20423250.0 (2.51% faster), Energy: 5730.8753387199995 uW (-2.56% lower)

Current iteration: 188 (temperature: 81.32) -- Latency: 20694942.0 (1.21% faster), Energy: 5979.120211040001 uW (-7.00% lower), Add Mem Traffic: 15674752.0 bits (5.82% smaller)
Current iteration: 189 (temperature: 81.22) -- Latency: 20694942.0 (1.21% faster), Energy: 5978.8102110400005 uW (-6.99% lower), Add Mem Traffic: 15674752.0 bits (5.82% smaller)
Current iteration: 190 (temperature: 81.12) -- Latency: 20692062.0 (1.23% faster), Energy: 5978.90871728 uW (-6.99% lower), Add Mem Traffic: 15313408.0 bits (7.99% smaller)
Current iteration: 191 (temperature: 81.02) -- Latency: 20692062.0 (1.23% faster), Energy: 5978.90871728 uW (-6.99% lower), Add Mem Traffic: 15313408.0 bits (7.99% smaller)
Current iteration: 192 (temperature: 80.92) -- Latency: 20692074.0 (1.23% faster), Energy: 5976.1127147199995 uW (-6.94% lower), Add Mem Traffic: 15313280.0 bits (7.99% smaller)
Current iteration: 193 (temperature: 80.82) -- Latency: 20691114.0 (1.23% faster), Energy: 5972.99618992 uW (-6.89% lower

Current iteration: 235 (temperature: 76.62) -- Latency: 20996406.0 (-0.23% faster), Energy: 5815.85522432 uW (-4.08% lower), Add Mem Traffic: 15178304.0 bits (8.80% smaller)
Current iteration: 236 (temperature: 76.52) -- Latency: 20984598.0 (-0.17% faster), Energy: 5816.269011199999 uW (-4.08% lower), Add Mem Traffic: 15021120.0 bits (9.75% smaller)
Current iteration: 237 (temperature: 76.42) -- Latency: 20984598.0 (-0.17% faster), Energy: 5816.269011199999 uW (-4.08% lower), Add Mem Traffic: 15021120.0 bits (9.75% smaller)
Current iteration: 238 (temperature: 76.32) -- Latency: 21012252.0 (-0.30% faster), Energy: 5769.365267839999 uW (-3.24% lower), Add Mem Traffic: 15123648.0 bits (9.13% smaller)
Current iteration: 239 (temperature: 76.22) -- Latency: 20936214.0 (0.06% faster), Energy: 5885.18023936 uW (-5.32% lower), Add Mem Traffic: 15125568.0 bits (9.12% smaller)
Current iteration: 240 (temperature: 76.12) -- Latency: 20900832.0 (0.23% faster), Energy: 5889.009271680001 uW (-5.39%

Current iteration: 282 (temperature: 71.93) -- Latency: 20752608.0 (0.94% faster), Energy: 5913.41838464 uW (-5.82% lower), Add Mem Traffic: 14722944.0 bits (11.54% smaller)
Current iteration: 283 (temperature: 71.83) -- Latency: 20767968.0 (0.86% faster), Energy: 5931.21753344 uW (-6.14% lower), Add Mem Traffic: 14753664.0 bits (11.36% smaller)
Current iteration: 284 (temperature: 71.73) -- Latency: 20767968.0 (0.86% faster), Energy: 5931.21753344 uW (-6.14% lower), Add Mem Traffic: 14753664.0 bits (11.36% smaller)
Current iteration: 285 (temperature: 71.63) -- Latency: 20845824.0 (0.49% faster), Energy: 5933.584724480001 uW (-6.18% lower), Add Mem Traffic: 14722432.0 bits (11.54% smaller)
Current iteration: 286 (temperature: 71.53) -- Latency: 20845824.0 (0.49% faster), Energy: 5933.72472448 uW (-6.19% lower), Add Mem Traffic: 14722432.0 bits (11.54% smaller)
Current iteration: 287 (temperature: 71.43) -- Latency: 20684904.0 (1.26% faster), Energy: 5925.62793856 uW (-6.04% lower), Ad

Current iteration: 329 (temperature: 67.23) -- Latency: 20683914.0 (1.27% faster), Energy: 5935.453206399998 uW (-6.22% lower), Add Mem Traffic: 14707648.0 bits (11.63% smaller)
Current iteration: 330 (temperature: 67.13) -- Latency: 20683914.0 (1.27% faster), Energy: 5935.423206399998 uW (-6.22% lower), Add Mem Traffic: 14707648.0 bits (11.63% smaller)
Current iteration: 331 (temperature: 67.03) -- Latency: 20810058.0 (0.66% faster), Energy: 5948.242192639998 uW (-6.45% lower), Add Mem Traffic: 14845888.0 bits (10.80% smaller)
Current iteration: 332 (temperature: 66.93) -- Latency: 20810058.0 (0.66% faster), Energy: 5948.242192639998 uW (-6.45% lower), Add Mem Traffic: 14845888.0 bits (10.80% smaller)
Current iteration: 333 (temperature: 66.83) -- Latency: 20810058.0 (0.66% faster), Energy: 5948.312192639999 uW (-6.45% lower), Add Mem Traffic: 14845888.0 bits (10.80% smaller)
Current iteration: 334 (temperature: 66.73) -- Latency: 20810058.0 (0.66% faster), Energy: 5948.622192639999 u

Current iteration: 376 (temperature: 62.54) -- Latency: 20617230.0 (1.58% faster), Energy: 5765.397048960002 uW (-3.17% lower), Add Mem Traffic: 12712192.0 bits (23.62% smaller)
Current iteration: 377 (temperature: 62.44) -- Latency: 20616606.0 (1.59% faster), Energy: 5750.114215680002 uW (-2.90% lower), Add Mem Traffic: 12700160.0 bits (23.69% smaller)
Current iteration: 378 (temperature: 62.34) -- Latency: 20617230.0 (1.58% faster), Energy: 5765.467048960002 uW (-3.17% lower), Add Mem Traffic: 12712192.0 bits (23.62% smaller)
Current iteration: 379 (temperature: 62.24) -- Latency: 20526942.0 (2.01% faster), Energy: 5757.016353920002 uW (-3.02% lower), Add Mem Traffic: 13056640.0 bits (21.55% smaller)
Current iteration: 380 (temperature: 62.14) -- Latency: 20539326.0 (1.96% faster), Energy: 5752.051854080002 uW (-2.93% lower), Add Mem Traffic: 13139072.0 bits (21.06% smaller)
Current iteration: 381 (temperature: 62.04) -- Latency: 20533086.0 (1.99% faster), Energy: 5754.202099200002 u

Current iteration: 423 (temperature: 57.84) -- Latency: 20711340.0 (1.13% faster), Energy: 5906.4384947200015 uW (-5.70% lower), Add Mem Traffic: 13142016.0 bits (21.04% smaller)
Current iteration: 424 (temperature: 57.74) -- Latency: 20711340.0 (1.13% faster), Energy: 5909.116644320002 uW (-5.75% lower), Add Mem Traffic: 13147328.0 bits (21.01% smaller)
Current iteration: 425 (temperature: 57.64) -- Latency: 20711340.0 (1.13% faster), Energy: 5909.116644320002 uW (-5.75% lower), Add Mem Traffic: 13147328.0 bits (21.01% smaller)
Current iteration: 426 (temperature: 57.54) -- Latency: 20710536.0 (1.14% faster), Energy: 5908.258918880002 uW (-5.73% lower), Add Mem Traffic: 13147328.0 bits (21.01% smaller)
Current iteration: 427 (temperature: 57.44) -- Latency: 20710578.0 (1.14% faster), Energy: 5875.785433440003 uW (-5.15% lower), Add Mem Traffic: 13140544.0 bits (21.05% smaller)
Current iteration: 428 (temperature: 57.34) -- Latency: 20710578.0 (1.14% faster), Energy: 5875.805433440002 

Current iteration: 469 (temperature: 53.25) -- Latency: 20744274.0 (0.98% faster), Energy: 5937.154296960003 uW (-6.25% lower), Add Mem Traffic: 13243520.0 bits (20.43% smaller)
Current iteration: 470 (temperature: 53.15) -- Latency: 20758386.0 (0.91% faster), Energy: 5906.654953600003 uW (-5.70% lower), Add Mem Traffic: 13335168.0 bits (19.88% smaller)
Current iteration: 471 (temperature: 53.05) -- Latency: 20758386.0 (0.91% faster), Energy: 5906.654953600003 uW (-5.70% lower), Add Mem Traffic: 13335168.0 bits (19.88% smaller)
Current iteration: 472 (temperature: 52.95) -- Latency: 20758386.0 (0.91% faster), Energy: 5906.654953600003 uW (-5.70% lower), Add Mem Traffic: 13335168.0 bits (19.88% smaller)
Current iteration: 473 (temperature: 52.85) -- Latency: 20758386.0 (0.91% faster), Energy: 5906.654953600003 uW (-5.70% lower), Add Mem Traffic: 13335168.0 bits (19.88% smaller)
Current iteration: 474 (temperature: 52.75) -- Latency: 20758962.0 (0.91% faster), Energy: 5911.208868480003 u

Current iteration: 516 (temperature: 48.55) -- Latency: 20464320.0 (2.31% faster), Energy: 5967.0601144 uW (-6.78% lower), Add Mem Traffic: 14206592.0 bits (14.64% smaller)
Current iteration: 517 (temperature: 48.45) -- Latency: 20459520.0 (2.34% faster), Energy: 5966.99064816 uW (-6.78% lower), Add Mem Traffic: 14190336.0 bits (14.74% smaller)
Current iteration: 518 (temperature: 48.35) -- Latency: 20376000.0 (2.73% faster), Energy: 6007.421116 uW (-7.50% lower), Add Mem Traffic: 14245632.0 bits (14.41% smaller)
Current iteration: 519 (temperature: 48.25) -- Latency: 20377728.0 (2.73% faster), Energy: 5998.83541872 uW (-7.35% lower), Add Mem Traffic: 14268672.0 bits (14.27% smaller)
Current iteration: 520 (temperature: 48.15) -- Latency: 20377728.0 (2.73% faster), Energy: 5984.33497904 uW (-7.09% lower), Add Mem Traffic: 14260480.0 bits (14.32% smaller)
Current iteration: 521 (temperature: 48.05) -- Latency: 20377728.0 (2.73% faster), Energy: 5984.40497904 uW (-7.09% lower), Add Mem T

Current iteration: 563 (temperature: 43.86) -- Latency: 20511786.0 (2.09% faster), Energy: 5832.842036000001 uW (-4.38% lower), Add Mem Traffic: 14907008.0 bits (10.43% smaller)
Current iteration: 564 (temperature: 43.76) -- Latency: 20511774.0 (2.09% faster), Energy: 5835.1408648000015 uW (-4.42% lower), Add Mem Traffic: 14907392.0 bits (10.43% smaller)
Current iteration: 565 (temperature: 43.66) -- Latency: 20476380.0 (2.26% faster), Energy: 5829.171851040002 uW (-4.31% lower), Add Mem Traffic: 14900096.0 bits (10.48% smaller)
Current iteration: 566 (temperature: 43.56) -- Latency: 20477340.0 (2.25% faster), Energy: 5832.398375840003 uW (-4.37% lower), Add Mem Traffic: 14905216.0 bits (10.44% smaller)
Current iteration: 567 (temperature: 43.46) -- Latency: 20477340.0 (2.25% faster), Energy: 5832.398375840003 uW (-4.37% lower), Add Mem Traffic: 14905216.0 bits (10.44% smaller)
Current iteration: 568 (temperature: 43.36) -- Latency: 20536512.0 (1.97% faster), Energy: 5836.206049440003 

Current iteration: 610 (temperature: 39.16) -- Latency: 20416386.0 (2.54% faster), Energy: 5856.421931520001 uW (-4.80% lower), Add Mem Traffic: 14151936.0 bits (14.97% smaller)
Current iteration: 611 (temperature: 39.06) -- Latency: 20416386.0 (2.54% faster), Energy: 5856.421931520001 uW (-4.80% lower), Add Mem Traffic: 14151936.0 bits (14.97% smaller)
Current iteration: 612 (temperature: 38.96) -- Latency: 20410626.0 (2.57% faster), Energy: 5854.91134848 uW (-4.78% lower), Add Mem Traffic: 14020928.0 bits (15.76% smaller)
Current iteration: 613 (temperature: 38.86) -- Latency: 20410626.0 (2.57% faster), Energy: 5854.71134848 uW (-4.77% lower), Add Mem Traffic: 14020928.0 bits (15.76% smaller)
Current iteration: 614 (temperature: 38.76) -- Latency: 20427762.0 (2.49% faster), Energy: 5857.407374080001 uW (-4.82% lower), Add Mem Traffic: 14256704.0 bits (14.34% smaller)
Current iteration: 615 (temperature: 38.66) -- Latency: 20427762.0 (2.49% faster), Energy: 5857.307374080002 uW (-4.82

Current iteration: 657 (temperature: 34.47) -- Latency: 20531466.0 (1.99% faster), Energy: 5820.2886822400005 uW (-4.16% lower), Add Mem Traffic: 14074624.0 bits (15.44% smaller)
Current iteration: 658 (temperature: 34.37) -- Latency: 20531466.0 (1.99% faster), Energy: 5820.2886822400005 uW (-4.16% lower), Add Mem Traffic: 14074624.0 bits (15.44% smaller)
Current iteration: 659 (temperature: 34.27) -- Latency: 20518410.0 (2.06% faster), Energy: 5822.51242368 uW (-4.20% lower), Add Mem Traffic: 14050048.0 bits (15.58% smaller)
Current iteration: 660 (temperature: 34.17) -- Latency: 20518410.0 (2.06% faster), Energy: 5822.51242368 uW (-4.20% lower), Add Mem Traffic: 14050048.0 bits (15.58% smaller)
Current iteration: 661 (temperature: 34.07) -- Latency: 20517414.0 (2.06% faster), Energy: 5822.2843276799995 uW (-4.19% lower), Add Mem Traffic: 14066432.0 bits (15.48% smaller)
Current iteration: 662 (temperature: 33.97) -- Latency: 20522022.0 (2.04% faster), Energy: 5816.86197696 uW (-4.09%

Current iteration: 704 (temperature: 29.77) -- Latency: 20713650.0 (1.12% faster), Energy: 5739.766646400002 uW (-2.71% lower), Add Mem Traffic: 15244096.0 bits (8.41% smaller)
Current iteration: 705 (temperature: 29.67) -- Latency: 20713650.0 (1.12% faster), Energy: 5739.576646400002 uW (-2.71% lower), Add Mem Traffic: 15244096.0 bits (8.41% smaller)
Current iteration: 706 (temperature: 29.57) -- Latency: 20713650.0 (1.12% faster), Energy: 5739.576646400002 uW (-2.71% lower), Add Mem Traffic: 15244096.0 bits (8.41% smaller)
Current iteration: 707 (temperature: 29.47) -- Latency: 20713650.0 (1.12% faster), Energy: 5739.376646400002 uW (-2.71% lower), Add Mem Traffic: 15244096.0 bits (8.41% smaller)
Current iteration: 708 (temperature: 29.37) -- Latency: 21318066.0 (-1.76% faster), Energy: 5753.224600960001 uW (-2.96% lower), Add Mem Traffic: 15199040.0 bits (8.68% smaller)
Current iteration: 709 (temperature: 29.27) -- Latency: 21316722.0 (-1.76% faster), Energy: 5749.8366252800015 uW 

Current iteration: 750 (temperature: 25.17) -- Latency: 21233412.0 (-1.36% faster), Energy: 5781.855136640002 uW (-3.47% lower), Add Mem Traffic: 14136448.0 bits (15.06% smaller)
Current iteration: 751 (temperature: 25.08) -- Latency: 21233412.0 (-1.36% faster), Energy: 5781.855136640002 uW (-3.47% lower), Add Mem Traffic: 14136448.0 bits (15.06% smaller)
Current iteration: 752 (temperature: 24.98) -- Latency: 21231492.0 (-1.35% faster), Energy: 5781.353321600001 uW (-3.46% lower), Add Mem Traffic: 14092800.0 bits (15.33% smaller)
Current iteration: 753 (temperature: 24.88) -- Latency: 21231492.0 (-1.35% faster), Energy: 5786.515931520001 uW (-3.55% lower), Add Mem Traffic: 14094848.0 bits (15.31% smaller)
Current iteration: 754 (temperature: 24.78) -- Latency: 21231492.0 (-1.35% faster), Energy: 5786.515931520001 uW (-3.55% lower), Add Mem Traffic: 14094848.0 bits (15.31% smaller)
Current iteration: 755 (temperature: 24.68) -- Latency: 21208872.0 (-1.24% faster), Energy: 5802.12211136

Current iteration: 797 (temperature: 20.48) -- Latency: 20292174.0 (3.14% faster), Energy: 5871.450693760002 uW (-5.07% lower), Add Mem Traffic: 12484352.0 bits (24.99% smaller)
Current iteration: 798 (temperature: 20.38) -- Latency: 20292174.0 (3.14% faster), Energy: 5871.400693760002 uW (-5.07% lower), Add Mem Traffic: 12484352.0 bits (24.99% smaller)
Current iteration: 799 (temperature: 20.28) -- Latency: 20292174.0 (3.14% faster), Energy: 5873.363303680002 uW (-5.11% lower), Add Mem Traffic: 12486400.0 bits (24.98% smaller)
Current iteration: 800 (temperature: 20.18) -- Latency: 20291538.0 (3.14% faster), Energy: 5858.076231040001 uW (-4.83% lower), Add Mem Traffic: 12473984.0 bits (25.05% smaller)
Current iteration: 801 (temperature: 20.08) -- Latency: 20319282.0 (3.01% faster), Energy: 5824.286839040001 uW (-4.23% lower), Add Mem Traffic: 12624000.0 bits (24.15% smaller)
Current iteration: 802 (temperature: 19.98) -- Latency: 20319282.0 (3.01% faster), Energy: 5824.286839040001 u

Current iteration: 844 (temperature: 15.78) -- Latency: 20390154.0 (2.67% faster), Energy: 5846.971650880002 uW (-4.63% lower), Add Mem Traffic: 14226816.0 bits (14.52% smaller)
Current iteration: 845 (temperature: 15.68) -- Latency: 20407434.0 (2.58% faster), Energy: 5851.508537920003 uW (-4.71% lower), Add Mem Traffic: 14619904.0 bits (12.16% smaller)
Current iteration: 846 (temperature: 15.58) -- Latency: 20490954.0 (2.19% faster), Energy: 5811.078070080002 uW (-3.99% lower), Add Mem Traffic: 14564608.0 bits (12.49% smaller)
Current iteration: 847 (temperature: 15.48) -- Latency: 20524554.0 (2.03% faster), Energy: 5813.752461120002 uW (-4.04% lower), Add Mem Traffic: 14805248.0 bits (11.05% smaller)
Current iteration: 848 (temperature: 15.38) -- Latency: 20524554.0 (2.03% faster), Energy: 5813.822461120002 uW (-4.04% lower), Add Mem Traffic: 14805248.0 bits (11.05% smaller)
Current iteration: 849 (temperature: 15.28) -- Latency: 20532234.0 (1.99% faster), Energy: 5814.202157760002 u

Current iteration: 891 (temperature: 11.09) -- Latency: 20509602.0 (2.10% faster), Energy: 5785.755460479998 uW (-3.54% lower), Add Mem Traffic: 14664832.0 bits (11.89% smaller)
Current iteration: 892 (temperature: 10.99) -- Latency: 20562666.0 (1.84% faster), Energy: 5780.598172799998 uW (-3.45% lower), Add Mem Traffic: 14677504.0 bits (11.81% smaller)
Current iteration: 893 (temperature: 10.89) -- Latency: 20473674.0 (2.27% faster), Energy: 5786.658416639997 uW (-3.55% lower), Add Mem Traffic: 14174464.0 bits (14.84% smaller)
Current iteration: 894 (temperature: 10.79) -- Latency: 20473674.0 (2.27% faster), Energy: 5786.658416639997 uW (-3.55% lower), Add Mem Traffic: 14174464.0 bits (14.84% smaller)
Current iteration: 895 (temperature: 10.69) -- Latency: 20473674.0 (2.27% faster), Energy: 5790.489721599997 uW (-3.62% lower), Add Mem Traffic: 14175488.0 bits (14.83% smaller)
Current iteration: 896 (temperature: 10.59) -- Latency: 20435940.0 (2.45% faster), Energy: 5783.627531519997 u

Current iteration: 935 (temperature: 6.69) -- Latency: 20366022.0 (2.78% faster), Energy: 5811.50253104 uW (-4.00% lower), Add Mem Traffic: 14817600.0 bits (10.97% smaller)
Current iteration: 936 (temperature: 6.59) -- Latency: 20366022.0 (2.78% faster), Energy: 5811.50253104 uW (-4.00% lower), Add Mem Traffic: 14817600.0 bits (10.97% smaller)
Current iteration: 937 (temperature: 6.49) -- Latency: 20347590.0 (2.87% faster), Energy: 5809.1519787199995 uW (-3.96% lower), Add Mem Traffic: 14620992.0 bits (12.15% smaller)
Current iteration: 938 (temperature: 6.39) -- Latency: 20347590.0 (2.87% faster), Energy: 5809.1519787199995 uW (-3.96% lower), Add Mem Traffic: 14620992.0 bits (12.15% smaller)
Current iteration: 939 (temperature: 6.29) -- Latency: 20347590.0 (2.87% faster), Energy: 5809.18197872 uW (-3.96% lower), Add Mem Traffic: 14620992.0 bits (12.15% smaller)
Current iteration: 940 (temperature: 6.19) -- Latency: 20347590.0 (2.87% faster), Energy: 5809.18197872 uW (-3.96% lower), Ad

Current iteration: 978 (temperature: 2.40) -- Latency: 20262924.0 (3.27% faster), Energy: 5691.756156160001 uW (-1.86% lower), Add Mem Traffic: 14495552.0 bits (12.91% smaller)
Current iteration: 979 (temperature: 2.30) -- Latency: 20262924.0 (3.27% faster), Energy: 5691.756156160001 uW (-1.86% lower), Add Mem Traffic: 14495552.0 bits (12.91% smaller)
Found best so far:  [3, 1, 5, 2, 1, 1, 3, 2, 5, 3, 3, 4, 5, 4, 4, 6, 3, 5, 6, 4, 4, 4, 1, 4, 1, 3, 2, 1, 5, 5, 1, 6, 1, 1, 1, 5, 2, 4, 5, 4, 5, 2, 3, 1, 6, 4, 6, 6, 1, 5, 3, 1, 5]  .. updating cost_best: 115260174744.62962
Current iteration: 980 (temperature: 2.20) -- Latency: 20261964.0 (3.28% faster), Energy: 5688.499631360001 uW (-1.80% lower), Add Mem Traffic: 14490432.0 bits (12.94% smaller)
Current iteration: 981 (temperature: 2.10) -- Latency: 20261964.0 (3.28% faster), Energy: 5688.499631360001 uW (-1.80% lower), Add Mem Traffic: 14490432.0 bits (12.94% smaller)
Found best so far:  [3, 1, 5, 2, 1, 1, 3, 2, 5, 3, 3, 4, 5, 4, 4, 6, 

Copy the best states

In [11]:
# Load the states saved by the simulated-annealing cell. The file name is
# derived from `cooling_scheduler` (the same way it was written) instead of a
# hardcoded 'linear', so a different schedule does not pick up a stale file.
state_file = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir,
                          'SA_{}_state.yaml'.format(cooling_scheduler))
with open(state_file, 'r') as f:
    states = yaml.safe_load(f)
best_state = states['best']

if len(best_state) != n_layers:
    raise ValueError("Saved best state has {} entries but n_layers is {}".format(len(best_state), n_layers))

# move the best solution result: copy every regular file of the chosen
# eval<id> directory into the layer's own directory
for layer_idx in range(1, n_layers + 1):
    loopnest_id = best_state[layer_idx - 1]
    layer_dir = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'layer{}'.format(layer_idx))
    src = os.path.join(layer_dir, 'eval{}'.format(loopnest_id))
    for entry in os.listdir(src):
        file_name = os.path.join(src, entry)
        if os.path.isfile(file_name):
            shutil.copy(file_name, layer_dir)
            

Generate stats.csv

In [14]:
from authblock_assignment import AuthBlockAssignment

# Joint-mode assignment run in "search" mode with summary generation enabled
# and no cost dictionaries requested back.
AuthBlockAssignment(
    n_layers, layer_info,
    base_dir, timeloop_dir, top_dir, sub_dir,
    configuration_dict,
    mode="search",
    joint=True,
    generate_summary=True,
    return_cost_dict=False,
)

In [None]:
# Same joint "search" run, but asking for the cost dictionaries instead of a
# summary; the third returned value is not needed here.
assignment_result = AuthBlockAssignment(
    n_layers, layer_info,
    base_dir, timeloop_dir, top_dir, sub_dir,
    configuration_dict,
    mode="search",
    joint=True,
    generate_summary=False,
    return_cost_dict=True,
)
cost, rehash, _ = assignment_result

In [None]:
# Print, layer by layer, the timeloop energy from the fresh assignment next to
# the value held in the annealing loop's `current_cost_dict`.
for layer_idx in range(1, n_layers + 1):
    fresh_energy = cost[layer_idx]['timeloop_energy']
    annealed_energy = current_cost_dict[layer_idx]['timeloop_energy']
    print(fresh_energy, annealed_energy)

In [None]:
# Print the total energy of every entry in the rehash cost dictionary.
for rehash_key in rehash:
    print(rehash[rehash_key]['total_energy'])