## Run SecureLoop Scheduling

Run all three steps of scheduling and compare the effect of different scheduling algorithms (Fig. 11). For more detailed experiments for each step, check 1) `run_loopnest_scheduling.ipynb`, 2) `run_authblock_assignment.ipynb`, and 3) `run_simulated_annealing.ipynb`.

In [None]:
%load_ext autoreload
%autoreload 2

import os
import yaml
import shutil
from pathlib import Path

### 1. Define the accelerator architecture / parameters

First, define an architecture design. The code below generates a new architecture configuration, or detects an existing one, based on the template design at `designs/{design_name}/template`. The source code includes the `eyeriss_like` architecture that we used for the experiments in the paper.

In [None]:
from utils import generate_arch_files, xml2mapping 

# Accelerator configuration for this run, collected in a single dict literal.
configuration_dict = {
    # template design (with constraints and memory hierarchy representing "dataflow")
    'TEMPLATE_DESIGN': 'eyeriss_like',

    # number of bits used for I/O/W; we assume integer
    'WORDBITS': 16,

    # DRAM bandwidth setting: words / cycle (not bits / cycle)
    'DRAM_READ_BANDWIDTH': 32,
    'DRAM_WRITE_BANDWIDTH': 32,

    # SRAM setting
    # - SRAM_SHARED: a single shared glb (True) or one glb per datatype (False)
    # - one list entry per glb (a single entry when shared):
    #   depth / width / #banks and read / write bandwidths
    # - SRAM width and SRAM banks define the maximum possible bandwidth
    'SRAM_SHARED': True,
    'SRAM_DEPTH': [2 ** 13],
    'SRAM_WIDTH': [2 ** 7],
    'SRAM_BANKS': [32],
    'SRAM_READ_BANDWIDTH': [32],
    'SRAM_WRITE_BANDWIDTH': [32],

    # PE array setting
    # - PE array shape is PE_X x PE_Y
    # - PE_SPAD_SHARED: one shared scratchpad per PE (True) or a separate
    #   scratchpad per datatype (False); list order is Weight, IFmap, OFmap
    'PE_X': 14,
    'PE_Y': 12,
    'PE_SPAD_SHARED': False,
    'PE_SPAD_DEPTH': [192, 12, 16],
    'PE_SPAD_WIDTH': [16, 16, 16],

    # Cryptographic engine setting
    # - type of cryptographic engine + dram (LPDDR4 + AES-GCM)
    # - AES-GCM cost: avg. cycle / 128bit
    # - CRYPT_ENGINE_SHARED: engines shared among all datatypes (True) or
    #   assigned to each datatype (False)
    'CRYPT_ENGINE_TYPE': 'effective_lpddr4_aesgcm',
    'CRYPT_ENGINE_CYCLE_PER_BLOCK': 11,
    'CRYPT_ENGINE_SHARED': False,
    'CRYPT_ENGINE_COUNT': [1, 1, 1],

    'EFFECTIVE_CONSERVATIVE': True,
}

# Create a directory for this configuration if it doesn't exist already.
# Iterate through the design folders (everything except 'template') and reuse
# the first one whose config.yaml equals configuration_dict.
design_dir = 'designs/{}'.format(configuration_dict['TEMPLATE_DESIGN'])
arch_dir = None
total_vers = 0
for path in os.listdir(design_dir):
    candidate_dir = os.path.join(design_dir, path)
    if path == 'template' or not os.path.isdir(candidate_dir):
        continue
    try:
        with open(os.path.join(candidate_dir, 'config.yaml'), 'r') as f:
            config_file = yaml.safe_load(f)
    except FileNotFoundError:
        print("No config.yaml file in the directory {}".format(str(candidate_dir)))
        continue
    except (OSError, yaml.YAMLError) as err:
        # Unreadable or malformed config: report the real cause and skip it.
        print("Skipping {}: could not read config.yaml ({})".format(candidate_dir, err))
        continue

    # Only folders with a readable config count as existing versions.
    total_vers += 1
    if config_file == configuration_dict:
        arch_dir = path
        print("Pre-existing folder found. Setting the arch_dir to {}".format(arch_dir))
        break

if arch_dir is None:
    # Start from the number of readable configurations, but skip names that
    # are already taken: shutil.copytree refuses to copy into an existing
    # directory (e.g. a leftover folder without config.yaml).
    while os.path.exists(os.path.join(design_dir, 'ver{}'.format(total_vers))):
        total_vers += 1
    arch_dir = 'ver{}'.format(total_vers)
    new_arch_path = os.path.join(design_dir, arch_dir)

    shutil.copytree(os.path.join(design_dir, 'template'), new_arch_path)
    with open(os.path.join(new_arch_path, 'config.yaml'), 'w') as f:
        _ = yaml.dump(configuration_dict, f)

    # create baseline and effective files
    generate_arch_files(os.path.join(new_arch_path, 'arch'), configuration_dict)

    # create the scheduling / evaluation folders and their baseline
    # counterparts; exist_ok tolerates a template that already contains them
    for folder in ('scheduling', 'evaluation',
                   'baseline_scheduling', 'baseline_evaluation'):
        os.makedirs(os.path.join(new_arch_path, folder), exist_ok=True)

### 2. Define the DNN workload

Define a workload as a PyTorch `torch.nn.Module`. Then, convert the workload to Timeloop's workload format and extract the unique layers and the interlayer dependency information.

#### 2-1. Convert PyTorch workload to Timeloop workload (< 1 min)

Define your workload. We used AlexNet, Resnet18, and MobilenetV2 for experiments. **Comment in/out the model you want to test, or define your own model here.**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision.models as model_zoo

import pytorch2timeloop as pytorch2timeloop

# Note: this version only supports nn.Conv2d (normal, depthwise and pointwise
# convolutions) and nn.Linear.
#
# Select exactly one model by (un)commenting its three lines.

# --- AlexNet ---
# model_name = 'alexnet'
# net = model_zoo.alexnet(pretrained=False)
# layers_exclude_from_search = [6, 7, 8] # layer indices to leave out of the simulated annealing search (e.g., non-conv layers in AlexNet)

# --- ResNet18 ---
# model_name = 'resnet18'
# net = model_zoo.resnet18(pretrained=False)
# layers_exclude_from_search = []

# --- MobilenetV2 (currently selected) ---
model_name = 'mobilenet_v2'
net = model_zoo.mobilenet_v2(pretrained=False)
layers_exclude_from_search = []

# Input shape and batch size
input_size = (3, 224, 224)
batch_size = 1

# print(net)

# The converted Timeloop workloads live in workloads/{model_name}_batch{batch_size}
top_dir = 'workloads'
sub_dir = f'{model_name}_batch{batch_size}'
exception_module_names = []

# Convert only when the output folder is missing, unless overwrite is set.
overwrite = False
if overwrite or not os.path.exists(os.path.join(top_dir, sub_dir)):
    pytorch2timeloop.convert_model(
        net, input_size, batch_size,
        sub_dir, top_dir,
        True,
        exception_module_names,
    )

#### 2-2. Extract unique layers (per-layer Timeloop loopnest scheduling only for unique layers) (< 1 min)

In [None]:
# Notebook working directory and the design folder selected above.
base_dir = Path(os.getcwd())
timeloop_dir = 'designs/{}/{}'.format(configuration_dict['TEMPLATE_DESIGN'], arch_dir)

# Layers are numbered from 1 in the order net.modules() yields Conv2d / Linear
# modules.
#   layer_dict           : layer number -> parsed workload yaml
#   layer_duplicate_info : layer number -> earliest layer with an equal workload
#   unique_layers        : layer numbers that have no earlier duplicate
n_layers = 0
layer_dict = {}
layer_duplicate_info = {}
unique_layers = []
for module in net.modules():
    if not isinstance(module, (nn.Conv2d, nn.Linear)):
        continue

    n_layers += 1
    if n_layers not in layer_dict:
        workload_path = os.path.join(base_dir, top_dir, sub_dir, f'{sub_dir}_layer{n_layers}.yaml')
        with open(workload_path) as f:
            workload_info = yaml.safe_load(f)
        layer_dict[n_layers] = workload_info

    # Look for the earliest layer with the same workload; the else branch
    # runs only when no such layer was found.
    for key in range(1, n_layers):
        if layer_dict[key] == layer_dict[n_layers]:
            layer_duplicate_info[n_layers] = key
            break
    else:
        unique_layers.append(n_layers)

# print(layer_duplicate_info)
# print(unique_layers)

#### 2-3. Extract interlayer dependency (1~10 min if running for the first time; < 1 min if saved yaml file is available)

We need to take care of interlayer dependencies. Since the layer number does not imply the actual layer order/dependency, and often multiple layers can be dependent on one layer and vice versa (e.g., residual connections), we create a back-propagation graph and determine the dependency from the graph. 

Depending on your accelerator architecture assumptions, some post-processing operations can be performed on-the-fly. Here, we assume that ReLU activation and batch normalization can be performed on-the-fly (thus, they do not affect the dependency), while pooling operations and adding multiple feature maps together (e.g., adding residual branches together) cannot be done on-the-fly (thus, they break the dependency).

If you have different assumptions about the post-processing or using different versions of PyTorch with different backend operations, `BackpropGraph.isDependentLayer` has to be modified accordingly.

Define whether you want to ignore the interlayer dependency entirely (i.e., always use rehashing). Then, the dependency dictionary will ignore all interlayer dependency. 

Finally, constructing and analyzing the back-propagation graph can take > 5 minutes for deep models like MobilenetV2. Since this dictionary has to be constructed only once per model, we first search for an existing dictionary. If none is found, we generate a graph, create the dictionary, and save it.

*We provide the layer_info.yaml files in `workloads` - if you want to generate your own info, remove those yaml files*

In [None]:
from pytorch_layer_dependency_utils import BackpropGraph

workload_path_1 = os.path.join(base_dir, top_dir, sub_dir, 'layer_info_interlayer.yaml')
workload_path_2 = os.path.join(base_dir, top_dir, sub_dir, 'layer_info_ignore_interlayer.yaml') # we also need this for baseline

# Set to True to leave every dependent_* list in layer_info empty (i.e.,
# ignore the interlayer dependency entirely and always use rehashing).
ignore_interlayer = False


def _new_layer_entry(timeloop_layer_id):
    """Return a fresh per-layer record with its own (unshared) lists."""
    return {
        'layer_id_for_timeloop': timeloop_layer_id,
        'prev_layer': [],
        'next_layer': [],
        'dependent_prev_layer': [],
        'dependent_next_layer': [],
    }


try:
    # Reuse the cached dictionaries when both files can be read.
    with open(workload_path_1, 'r') as f:
        layer_info = yaml.safe_load(f)
    with open(workload_path_2, 'r') as f:
        layer_info_ignore_interlayer = yaml.safe_load(f)

except (OSError, yaml.YAMLError):
    # Cache missing or unreadable: rebuild it from the back-propagation graph.
    graph = BackpropGraph(net, [1, input_size[0], input_size[1], input_size[2]])
    consecutive_dict, dependent_dict = graph.get_dependency_info()

    # construct layer_info
    layer_info = {}
    layer_info_ignore_interlayer = {}
    for layer_idx in range(1, n_layers + 1):
        if layer_idx in unique_layers:
            timeloop_layer_id = layer_idx
        else:
            timeloop_layer_id = layer_duplicate_info[layer_idx]
        # Each dictionary gets its own record. Sharing one record between
        # both would duplicate every prev/next entry and copy the dependent
        # layers into the "ignore interlayer" variant as well.
        layer_info[layer_idx] = _new_layer_entry(timeloop_layer_id)
        layer_info_ignore_interlayer[layer_idx] = _new_layer_entry(timeloop_layer_id)

    for layer_idx in range(1, n_layers + 1):
        consecutive = consecutive_dict[layer_idx]
        dependent = dependent_dict[layer_idx]
        layer_info[layer_idx]['next_layer'].extend(consecutive)
        layer_info_ignore_interlayer[layer_idx]['next_layer'].extend(consecutive)
        for i in consecutive:
            layer_info[i]['prev_layer'].append(layer_idx)
            layer_info_ignore_interlayer[i]['prev_layer'].append(layer_idx)
        # Dependencies are recorded only in layer_info.
        if len(dependent) > 0 and not ignore_interlayer:
            layer_info[layer_idx]['dependent_next_layer'].extend(dependent)
            for i in dependent:
                layer_info[i]['dependent_prev_layer'].append(layer_idx)

    # for layer_idx in range(1, n_layers + 1):
    #     print(layer_idx, layer_info[layer_idx])

    # store the results - building them can take long for deep models like MobileNet
    with open(workload_path_1, 'w') as f:
        _ = yaml.dump(layer_info, f)
    with open(workload_path_2, 'w') as f:
        _ = yaml.dump(layer_info_ignore_interlayer, f)


### 3. Define the top-k parameter and run timeloop-topk using the effective model (Step 1)

In [None]:
# Top-k parameter; it is written into the 'mapper' section of mapper.yaml.
topk = 6

# Load the mapper configuration, set its 'topk' entry, and write it back.
mapper_file_path = os.path.join(base_dir, timeloop_dir, 'mapper/mapper.yaml')
with open(mapper_file_path) as f:
    mapper_config = yaml.safe_load(f)

mapper_config['mapper'].update(topk=topk)

with open(mapper_file_path, 'w') as f:
    yaml.dump(mapper_config, f)

#### 3-1. Run timeloop-topk for all unique layers (~30 min)

In [None]:
import time

def get_cmd(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir):
    """Build the working directory and timeloop-mapper-topk command for a layer.

    Args:
        workload_info: parsed workload yaml; only
            workload_info['problem']['instance'] is inspected.
        layer_id: layer number used in the folder and workload file names.
        base_dir: pathlib.Path that all other path pieces are joined onto.
        timeloop_dir: design folder, relative to base_dir.
        sub_dir: workload folder name (also the workload file name prefix).
        top_dir: top-level workloads folder, relative to base_dir.

    Returns:
        [cwd, timeloopcmd]: the per-layer scheduling directory as a string and
        the command line (with a trailing space).
    """
    design_root = base_dir / timeloop_dir

    # A workload without an 'M' entry is treated as depthwise and gets the
    # dedicated constraint files.
    is_depthwise = 'M' not in workload_info['problem']['instance']
    if is_depthwise:
        constraint_pth = design_root / 'constraints_dw/*.yaml'
    else:
        constraint_pth = design_root / 'constraints/*.yaml'

    cwd = str(design_root / 'scheduling' / sub_dir / f'layer{layer_id}')

    arguments = [
        "timeloop-mapper-topk",
        str(design_root / 'arch/effective.yaml'),
        str(design_root / 'arch/components/*.yaml'),
        str(design_root / 'mapper/mapper.yaml'),
        str(constraint_pth),
        f"{base_dir / top_dir / sub_dir / sub_dir}_layer{layer_id}.yaml",
    ]
    # The command string ends with a single space.
    timeloopcmd = " ".join(arguments) + " "
    return [cwd, timeloopcmd]

# Collect the working directory and command line for every unique layer.
cwd_list = []
cmd_list = []

for layer_id in unique_layers:
    workload_path = os.path.join(base_dir, top_dir, sub_dir, '{}_layer{}.yaml'.format(sub_dir, layer_id))
    with open(workload_path, 'r') as f:
        workload_info = yaml.safe_load(f)
    [cwd, cmd] = get_cmd(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir)
    cwd_list.append(cwd)
    cmd_list.append(cmd)

# Parent folder for the per-layer scheduling directories.
os.makedirs(os.path.join(base_dir, timeloop_dir, 'scheduling', sub_dir), exist_ok=True)

start_time = time.time()
try:
    for cwd, cmd in zip(cwd_list, cmd_list):
        print("Executing cmd: {}".format(cmd))
        # Each command is launched from its own per-layer directory.
        os.makedirs(cwd, exist_ok=True)
        os.chdir(cwd)
        # cmd contains '*.yaml' glob patterns, so it goes through the shell.
        exit_status = os.system(cmd)
        if exit_status != 0:
            print("Warning: command returned status {} in {}".format(exit_status, cwd))
finally:
    # Always return to base_dir, even on an error or interrupt, so that the
    # following cells are not left in a per-layer directory.
    os.chdir(base_dir)

# Time this cell
print("Execution time: {}s".format(time.time() - start_time))

#### 3-2. Convert the found schedule to yaml files (~ 1 min)

In [None]:
def convert_to_mapping(base_dir, timeloop_dir, top_dir, sub_dir, layer_idx, topk_idx):
    """Convert one top-k mapper result (XML) of a layer into a mapping yaml file.

    Reads scheduling/<sub_dir>/layer<layer_idx>/timeloop-mapper-topk<topk_idx>.map+stats.xml
    below base_dir/timeloop_dir, passes it to xml2mapping together with the
    layer's workload file and the architecture constraint file, and writes the
    result as {'mapping': ...} to mapping<topk_idx>.yaml in the same folder.
    """
    layer_dir = os.path.join(base_dir, timeloop_dir, 'scheduling', sub_dir, "layer{}".format(layer_idx))
    xml_file = os.path.join(layer_dir, "timeloop-mapper-topk{}.map+stats.xml".format(topk_idx))
    workload_file = os.path.join(base_dir, top_dir, sub_dir, "{}_layer{}.yaml".format(sub_dir, layer_idx))
    # print(workload_file)
    with open(workload_file, 'r') as f:
        workload_info = yaml.safe_load(f)

    # A workload without an 'M' entry is treated as depthwise.
    dw = 'M' not in workload_info['problem']['instance']

    # Constraint file name per template design; any design not listed here
    # uses the weight-stationary file.
    constraint_file_by_design = {
        'eyeriss_like': 'eyeriss_like_arch_constraints.yaml',
        'eyeriss_like_hbm2': 'eyeriss_like_arch_constraints.yaml',
        'output_stationary': 'simple_output_stationary_arch_constraints.yaml',
    }
    constraint_file_name = constraint_file_by_design.get(
        configuration_dict['TEMPLATE_DESIGN'],
        'simple_weight_stationary_arch_constraints.yaml',
    )
    constraint_folder = 'constraints_dw' if dw else 'constraints'
    arch_constraint_file = os.path.join(base_dir, timeloop_dir, constraint_folder, constraint_file_name)

    # print(layer_idx, dw)
    mapping = xml2mapping(xml_file, workload_file, arch_constraint_file, dw)
    with open(os.path.join(layer_dir, "mapping{}.yaml".format(topk_idx)), 'w') as f:
        yaml.dump({'mapping': mapping}, f)


# Convert every top-k result (1..topk) of every unique layer.
for layer_idx in unique_layers:
    for k in range(1, topk + 1):
        convert_to_mapping(base_dir, timeloop_dir, top_dir, sub_dir, layer_idx, k)

#### 3-3. Evaluate the top-1 loopnest schedule (< 5 min)

Note that we use the effective off-chip bandwidth model for **scheduling**, but the actual energy and latency of the accelerator (excluding the cryptographic engine) has to be evaluated with the baseline model. The actual cost of cryptographic operations is more complex (i.e., AuthBlock assignment) and is added in the next step.

In [None]:
def get_cmd_model(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir):
    """Build the working directory and timeloop-model command for a layer.

    The command evaluates the layer's mapping1.yaml (from the scheduling
    folder) with arch/baseline.yaml.

    Args:
        workload_info: not used; kept so existing calls keep working. The
            command takes no constraint files, so the former
            depthwise/constraint lookup was dead code and has been removed.
        layer_id: layer number used in the folder and file names.
        base_dir: pathlib.Path that all other path pieces are joined onto.
        timeloop_dir: design folder, relative to base_dir.
        sub_dir: workload folder name (also the workload file name prefix).
        top_dir: top-level workloads folder, relative to base_dir.

    Returns:
        [cwd, timeloopcmd]: the per-layer evaluation directory as a string and
        the command line (with a trailing space).
    """
    cwd = f"{base_dir/timeloop_dir/'evaluation'/sub_dir/f'layer{layer_id}'}"

    timeloopcmd = f"timeloop-model " \
                  f"{base_dir/timeloop_dir/'arch/baseline.yaml'} " \
                  f"{base_dir/timeloop_dir/'arch/components/*.yaml'} " \
                  f"{base_dir/timeloop_dir/'scheduling'/sub_dir/f'layer{layer_id}/mapping1.yaml'} " \
                  f"{base_dir/top_dir/sub_dir/sub_dir}_layer{layer_id}.yaml "
    return [cwd, timeloopcmd]

# Collect the working directory and command line for every unique layer.
cwd_list = []
cmd_list = []
for layer_id in unique_layers:
    workload_path = os.path.join(base_dir, top_dir, sub_dir, '{}_layer{}.yaml'.format(sub_dir, layer_id))
    with open(workload_path, 'r') as f:
        workload_info = yaml.safe_load(f)
    [cwd, cmd] = get_cmd_model(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir)
    cwd_list.append(cwd)
    cmd_list.append(cmd)

# Parent folder for the per-layer evaluation directories.
os.makedirs(os.path.join(base_dir, timeloop_dir, 'evaluation', sub_dir), exist_ok=True)

try:
    for cwd, cmd in zip(cwd_list, cmd_list):
        print("Executing cmd: {}".format(cmd))
        # Each command is launched from its own per-layer directory.
        os.makedirs(cwd, exist_ok=True)
        os.chdir(cwd)
        # cmd contains '*.yaml' glob patterns, so it goes through the shell.
        exit_status = os.system(cmd)
        if exit_status != 0:
            print("Warning: command returned status {} in {}".format(exit_status, cwd))
finally:
    # Always return to base_dir, even on an error or interrupt, so that the
    # following cells are not left in a per-layer directory.
    os.chdir(base_dir)

### 4. AuthBlock Assignment (tile-as-an-AuthBlock vs. our algorithm)

#### 4-1. tile-as-an-AuthBlock using the per-layer top-1 loopnest schedule (*Crypt-Tile-Single*) (< 5 min)

In [None]:
from authblock_assignment import AuthBlockAssignment

# Crypt-Tile-Single: AuthBlock assignment in "tile" mode, non-joint, using the
# layer info that ignores interlayer dependencies.
(cts_cost_dict,
 cts_rehash_cost_dict,
 cts_block_info_dict) = AuthBlockAssignment(
    n_layers, layer_info_ignore_interlayer,
    base_dir, timeloop_dir, top_dir, sub_dir,
    configuration_dict,
    mode="tile",
    joint=False,
    generate_summary=True,
    return_cost_dict=True,
)

#### 4-2. Our optimal AuthBlock assignment algorithm using the per-layer top-1 loopnest schedule (*Crypt-Opt-Single*) (< 5 min)

In [None]:
from authblock_assignment import AuthBlockAssignment

# Crypt-Opt-Single: AuthBlock assignment in "search" mode, non-joint, using
# the layer info with interlayer dependencies.
(cos_cost_dict,
 cos_rehash_cost_dict,
 cos_block_info_dict) = AuthBlockAssignment(
    n_layers, layer_info,
    base_dir, timeloop_dir, top_dir, sub_dir,
    configuration_dict,
    mode="search",
    joint=False,
    generate_summary=True,
    return_cost_dict=True,
)

### 5. Simulated annealing for interlayer dependency 

Run simulated annealing to identify the optimal loopnest schedule when multiple layers are jointly explored. 

#### 5-1. Prepare folders before running simulated annealing (< 5 min)

We have to evaluate all top-k loopnest schedules for reference.

In [None]:
from authblock_assignment import AuthBlockAssignment

# joint_topk/<sub_dir>/layer<i>/eval<k> holds the evaluation of the k-th
# loopnest schedule for layer i.
joint_dir = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir)
os.makedirs(joint_dir, exist_ok=True)

base_cost_dict, base_rehash_cost_dict, base_block_info_dict = AuthBlockAssignment(n_layers, layer_info, \
                                                                                  base_dir, timeloop_dir, top_dir, sub_dir, \
                                                                                  configuration_dict, \
                                                                                  mode="search", \
                                                                                  joint=False, return_cost_dict=True)

# Baseline totals: per-layer costs followed by rehash costs.
baseline_energy = 0
baseline_latency = 0
baseline_add_mem_traffic = 0
for cost_dict in (base_cost_dict, base_rehash_cost_dict):
    for key in cost_dict:
        baseline_energy += cost_dict[key]['total_energy'] / 10**6
        baseline_latency += cost_dict[key]['total_latency']
        baseline_add_mem_traffic += cost_dict[key]['add_memory_traffic']

try:
    for layer_idx in range(1, n_layers + 1):
        work_dir = os.path.join(joint_dir, 'layer{}'.format(layer_idx))
        os.makedirs(work_dir, exist_ok=True)

        # Duplicate layers reuse the mappings found for their unique layer.
        layer_id_for_timeloop = layer_info[layer_idx]['layer_id_for_timeloop']

        for k in range(1, topk + 1):
            cwd = os.path.join(work_dir, 'eval{}'.format(k))
            os.makedirs(cwd, exist_ok=True)

            # timeloop-model takes no constraint files, so nothing here
            # depends on whether the layer is depthwise.
            timeloopcmd = f"timeloop-model " \
                  f"{base_dir/timeloop_dir/'arch/baseline.yaml'} " \
                  f"{base_dir/timeloop_dir/'arch/components/*.yaml'} " \
                  f"{base_dir/timeloop_dir/'scheduling'/sub_dir/f'layer{layer_id_for_timeloop}'/f'mapping{k}.yaml'} " \
                  f"{base_dir/top_dir/sub_dir/sub_dir}_layer{layer_idx}.yaml "

            os.chdir(cwd)
            exit_status = os.system(timeloopcmd)
            os.chdir(base_dir)
            if exit_status != 0:
                print("Warning: command returned status {} in {}".format(exit_status, cwd))

        # copy mapping1's result into here
        shutil.copy(os.path.join(work_dir, 'eval1', 'timeloop-model.map+stats.xml'), work_dir)
finally:
    # Leave the notebook in base_dir even if a step above failed.
    os.chdir(base_dir)

#### 5-2. Run simulated annealing (~ 10 min for AlexNet/MobilenetV2; ~90 min ResNet18)

First, define the hyperparameters for simulated annealing. Then, run the algorithm.

For every iteration of simulated annealing, AuthBlockAssignment has to be executed to evaluate the cost. To make the search process faster, we support a partial update of the AuthBlockAssignment (only recalculating the changed layer and its dependent layers). However, there are currently some bugs in the partial update for ResNet18, so we run the full AuthBlockAssignment for ResNet18 instead. As such, this cell takes much longer to execute for ResNet18.

In [None]:
# Simulated annealing hyperparameters: temperature range and iteration count.
initial_temp, final_temp = 100, 0.1
n_iters = 1000

# The annealing loop handles 'linear', 'cosine' and 'quadratic'.
cooling_scheduler = 'linear'

# TODO: partial update should not be used for ResNet18 - bug with dependent layer partial update due to residuals
use_partial_update = model_name != 'resnet18'

In [None]:
import random
import time
import csv
import copy
import math

from authblock_assignment import PartialUpdateAuthBlockAssignment

import ast  # parses the numbers in the Timeloop stats file without eval()


def _total_costs(cost_dict, rehash_cost_dict):
    """Sum energy (uJ), latency and additional memory traffic over both dicts."""
    energy_sum, latency_sum, traffic_sum = 0, 0, 0
    for costs in (cost_dict, rehash_cost_dict):
        for layer_key in costs:
            energy_sum += costs[layer_key]['total_energy'] / 10**6
            latency_sum += costs[layer_key]['total_latency']
            traffic_sum += costs[layer_key]['add_memory_traffic']
    return energy_sum, latency_sum, traffic_sum


csv_header = ['Iter', 'Temp', \
              'Cost (J x cycles)', 'Total Latency (cycles)', 'Total Energy (uJ)', 'Additional Off-chip Traffic (bits)']
logs = []

# "solution" is the currently accepted state; "current" is the candidate
# evaluated in each iteration. Both start from the baseline result.
solution_cost_dict = copy.deepcopy(base_cost_dict)
solution_rehash_cost_dict = copy.deepcopy(base_rehash_cost_dict)
solution_block_info_dict = copy.deepcopy(base_block_info_dict)

current_cost_dict = copy.deepcopy(base_cost_dict)
current_rehash_cost_dict = copy.deepcopy(base_rehash_cost_dict)
current_block_info_dict = copy.deepcopy(base_block_info_dict)

# State = chosen loopnest index (1..topk) per layer; everything starts at top-1.
solution_state = [1] * n_layers
current_state = [1] * n_layers
best_state = [1] * n_layers

i = 0
cost_best = baseline_energy * baseline_latency

# Only layers with a dependent neighbour, and not explicitly excluded, are searched.
layers_for_search = []
for idx in range(1, n_layers + 1):
    if len(layer_info[idx]['dependent_next_layer']) > 0 or len(layer_info[idx]['dependent_prev_layer']) > 0:
        if idx not in layers_exclude_from_search:
            layers_for_search.append(idx)
if not layers_for_search:
    raise ValueError("No layer is available for the simulated annealing search "
                     "(no dependent layers, or all of them are excluded)")

start_time = time.time()
while i < n_iters + 1:
    # temperature
    if cooling_scheduler == 'linear':
        current_temp = final_temp + (initial_temp - final_temp) / float(n_iters) * float(n_iters - i)
    elif cooling_scheduler == 'cosine':
        current_temp = final_temp + 0.5 * (initial_temp - final_temp) * (1 + math.cos(float(i) * math.pi / float(n_iters)))
    elif cooling_scheduler == 'quadratic':
        current_temp = final_temp + (initial_temp - final_temp) * (float(n_iters - i) / float(n_iters))**2
    else:
        raise ValueError("Unknown cooling_scheduler: {}".format(cooling_scheduler))

    # Propose a neighbour: one random layer gets a random loopnest out of the top-k.
    layer2change = random.choice(layers_for_search)
    neighbor_loopnest = random.choice(range(1, topk + 1))

    current_state[layer2change - 1] = neighbor_loopnest
    stats_file = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, "layer{}".format(layer2change), \
                              "eval{}".format(neighbor_loopnest), "timeloop-model.stats.txt")
    # Reset before parsing so that a stats file without these lines cannot
    # silently reuse the numbers of a previous iteration.
    energy = cycle = None
    with open(stats_file, 'r') as f:
        lines = f.read().split('\n')[-200:]
    for line in lines:
        if line.startswith('Energy'):
            energy = ast.literal_eval(line.split(': ')[1].split(' ')[0]) * float(10**6) # micro to pico
            # print(energy)
        elif line.startswith('Cycles'):
            cycle = ast.literal_eval(line.split(': ')[1])
    if energy is None or cycle is None:
        raise ValueError("Could not find the Energy/Cycles lines in {}".format(stats_file))
    current_cost_dict[layer2change]['timeloop_energy'] = energy
    current_cost_dict[layer2change]['timeloop_cycle'] = cycle

    # Make the candidate's result the layer's active result for AuthBlock assignment.
    xml_file = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, "layer{}".format(layer2change), \
                            "eval{}".format(neighbor_loopnest), "timeloop-model.map+stats.xml")
    shutil.copy(xml_file, os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'layer{}'.format(layer2change)))

    if use_partial_update:
        subset_layers = [layer2change]
        subset_layers.extend(layer_info[layer2change]['prev_layer'])
        subset_layers.extend(layer_info[layer2change]['next_layer'])

        current_cost_dict, current_rehash_cost_dict, current_block_info_dict = \
        PartialUpdateAuthBlockAssignment(n_layers, layer_info, \
                                         base_dir, timeloop_dir, top_dir, sub_dir, \
                                         configuration_dict, mode="search", \
                                         prev_block_info_dict=current_block_info_dict, subset_layers=subset_layers, \
                                         prev_cost_dict=current_cost_dict, prev_rehash_cost_dict=current_rehash_cost_dict)

    else:
        current_cost_dict, current_rehash_cost_dict, current_block_info_dict = \
        PartialUpdateAuthBlockAssignment(n_layers, layer_info, \
                                         base_dir, timeloop_dir, top_dir, sub_dir, \
                                         configuration_dict, \
                                         mode="search", \
                                         prev_block_info_dict=None, subset_layers=[], \
                                         prev_cost_dict=current_cost_dict, prev_rehash_cost_dict=None)

    solution_energy, solution_latency, solution_add_mem_traffic = \
        _total_costs(solution_cost_dict, solution_rehash_cost_dict)
    current_energy, current_latency, current_add_mem_traffic = \
        _total_costs(current_cost_dict, current_rehash_cost_dict)

    # Cost is the energy-latency product; a positive cost_diff means the
    # candidate is cheaper than the accepted solution.
    cost_solution = solution_energy * solution_latency
    cost_current = current_energy * current_latency
    cost_diff = (cost_solution - cost_current) / (10 ** 6 * n_layers)

    if cost_current < cost_best:
        best_state = copy.deepcopy(current_state)
        cost_best = cost_current
        print("Found best so far: ", best_state, " .. updating cost_best: {}".format(cost_best))

    # Accept improvements always, and worse candidates with probability exp(cost_diff / T).
    if cost_diff > 0 or (random.uniform(0, 1) < math.exp(cost_diff / current_temp)):
        solution_state = copy.deepcopy(current_state)
        solution_cost_dict = copy.deepcopy(current_cost_dict)
        solution_rehash_cost_dict = copy.deepcopy(current_rehash_cost_dict)
        solution_block_info_dict = copy.deepcopy(current_block_info_dict)
    else:
        # roll-back to the solution state
        xml_file = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, "layer{}".format(layer2change), \
                                  "eval{}".format(solution_state[layer2change - 1]), "timeloop-model.map+stats.xml")
        shutil.copy(xml_file, os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'layer{}'.format(layer2change)))
        current_state = copy.deepcopy(solution_state)
        current_cost_dict = copy.deepcopy(solution_cost_dict)
        current_rehash_cost_dict = copy.deepcopy(solution_rehash_cost_dict)
        current_block_info_dict = copy.deepcopy(solution_block_info_dict)

    # Re-evaluate the accepted solution after the accept/roll-back step so
    # that the logged cost matches the logged latency and energy.
    solution_energy, solution_latency, solution_add_mem_traffic = \
        _total_costs(solution_cost_dict, solution_rehash_cost_dict)
    cost_solution = solution_energy * solution_latency

    # print("Solution state: ", solution_state)
    print("Current iteration: {} (temperature: {:.2f}) -- Latency: {} ({:.2f}% faster), Energy: {} uJ ({:.2f}% lower), Add Mem Traffic: {} bits ({:.2f}% smaller)"\
          .format(i+1, current_temp, solution_latency, (baseline_latency - solution_latency) / float(baseline_latency) * 100. , \
                  solution_energy, (baseline_energy - solution_energy) / baseline_energy * 100., \
                  solution_add_mem_traffic, (baseline_add_mem_traffic - solution_add_mem_traffic) / float(baseline_add_mem_traffic) * 100.))

    curr_log = [(i + 1), current_temp, cost_solution, solution_latency, solution_energy, solution_add_mem_traffic]
    logs.append(curr_log)
    i += 1

    if current_temp < final_temp:
        break

print("Execution time: {}s".format(time.time() - start_time))

# dump to csv file
with open(os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'SA_{}_top{}_summary.csv'.format(cooling_scheduler, topk)), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(csv_header)
    writer.writerows(logs)

# dump best state & solution state to yaml file
state = {'best': best_state, 'final': solution_state}
with open(os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'SA_{}_state.yaml'.format(cooling_scheduler)), 'w') as f:
    _ = yaml.dump(state, f)
        

In [None]:
# Load the best state written by the simulated annealing cell. The file name
# follows cooling_scheduler (as the writer does), so a non-linear scheduler
# does not read a stale or missing 'linear' file.
state_file = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'SA_{}_state.yaml'.format(cooling_scheduler))
with open(state_file, 'r') as f:
    states = yaml.safe_load(f)
best_state = states['best']

# move the best solution result: copy every regular file from the selected
# eval<k> folder into the layer folder
for layer_idx in range(1, n_layers + 1):
    loopnest_id = best_state[layer_idx - 1]
    layer_dir = os.path.join(base_dir, timeloop_dir, 'joint_topk', sub_dir, 'layer{}'.format(layer_idx))
    src = os.path.join(layer_dir, 'eval{}'.format(loopnest_id))
    src_files = os.listdir(src)
    for file in src_files:
        file_name = os.path.join(src, file)
        if os.path.isfile(file_name):
            shutil.copy(file_name, layer_dir)
            

#### 5-3. Evaluate with AuthBlock Assignment (*Crypt-Opt-Cross*) (< 5 min)

In [None]:
from authblock_assignment import AuthBlockAssignment

# Run AuthBlockAssignment in "search" mode with joint=True and keep the three
# returned dictionaries (per-layer cost, rehash cost, block info) under the
# coc_* names used for the Crypt-Opt-Cross results.
coc_results = AuthBlockAssignment(
    n_layers,
    layer_info,
    base_dir,
    timeloop_dir,
    top_dir,
    sub_dir,
    configuration_dict,
    mode="search",
    joint=True,
    generate_summary=True,
    return_cost_dict=True,
)
coc_cost_dict, coc_rehash_cost_dict, coc_block_info_dict = coc_results

### 6. Putting all together: plot the graph (Fig. 11)

#### 6-1. Get the unsecured baseline architecture's latency and energy (~ 30 min)

To measure the slowdown over the unsecured design, we first have to run `timeloop-mapper-topk` on the baseline architecture. Repeat the process in Section 3, but with the baseline architecture definition.

In [None]:
def get_cmd(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir):
    """Build the working directory and `timeloop-mapper-topk` command for one baseline layer.

    Args:
        workload_info: parsed workload YAML; `workload_info['problem']['instance']`
            is checked for an 'M' key to choose the constraint directory.
        layer_id: layer number used in the directory and workload file names.
        base_dir: path object supporting the `/` operator (e.g. `pathlib.Path`).
        timeloop_dir: timeloop directory, relative to `base_dir`.
        sub_dir: workload name; used as a directory and as the workload file prefix.
        top_dir: workload root directory, relative to `base_dir`.

    Returns:
        A two-element list `[cwd, timeloopcmd]`, both strings. The command ends
        with a trailing space.
    """
    timeloop_root = base_dir / timeloop_dir
    run_dir = timeloop_root / 'baseline_scheduling' / sub_dir / f'layer{layer_id}'

    # A workload without an 'M' dimension is treated as depthwise and uses the
    # depthwise constraint set.
    has_m_dimension = 'M' in workload_info['problem']['instance']
    constraint_glob = 'constraints/*.yaml' if has_m_dimension else 'constraints_dw/*.yaml'

    pieces = [
        "timeloop-mapper-topk",
        str(timeloop_root / 'arch/baseline.yaml'),
        str(timeloop_root / 'arch/components/*.yaml'),
        str(timeloop_root / 'mapper/mapper.yaml'),
        str(timeloop_root / constraint_glob),
        f"{base_dir / top_dir / sub_dir / sub_dir}_layer{layer_id}.yaml",
    ]
    return [str(run_dir), " ".join(pieces) + " "]

# Collect one (working directory, command) pair per unique layer for the
# baseline mapper search.
cwd_list = []
cmd_list = []

for layer_id in unique_layers:
    workload_path = os.path.join(base_dir, top_dir, sub_dir, '{}_layer{}.yaml'.format(sub_dir, layer_id))
    with open(workload_path, 'r') as f:
        workload_info = yaml.safe_load(f)
    [cwd, cmd] = get_cmd(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir)
    cwd_list.append(cwd)
    cmd_list.append(cmd)

# makedirs(exist_ok=True) creates missing parent directories as well and is a
# no-op when the directory already exists, so no existence check or
# catch-all `except` is needed.
os.makedirs(os.path.join(base_dir, timeloop_dir, 'baseline_scheduling', sub_dir), exist_ok=True)
try:
    for cwd, cmd in zip(cwd_list, cmd_list):
        print("Executing cmd: {}".format(cmd))
        os.makedirs(cwd, exist_ok=True)
        # The command is executed from inside the layer's own directory.
        os.chdir(cwd)
        # The command string contains `*.yaml` globs, so it is run through the shell.
        os.system(cmd)
finally:
    # Always return to base_dir, even if a run is interrupted or fails.
    os.chdir(base_dir)

def convert_to_mapping(base_dir, timeloop_dir, top_dir, sub_dir, layer_idx, topk_idx):
    """Convert one top-k mapper XML result of a baseline layer into a mapping YAML file.

    Reads `timeloop-mapper-topk<topk_idx>.map+stats.xml` from the layer's
    `baseline_scheduling` directory, converts it with `xml2mapping`, and writes
    the result as `mapping<topk_idx>.yaml` (under a top-level 'mapping' key) in
    the same directory. The architecture constraint file is selected from the
    global `configuration_dict['TEMPLATE_DESIGN']`.

    Args:
        base_dir: root directory of the project.
        timeloop_dir: timeloop directory, relative to `base_dir`.
        top_dir: workload root directory, relative to `base_dir`.
        sub_dir: workload name (directory and workload file prefix).
        layer_idx: layer number used in directory and file names.
        topk_idx: index of the top-k result to convert.
    """
    layer_dir = os.path.join(base_dir, timeloop_dir, 'baseline_scheduling', sub_dir,
                             "layer{}".format(layer_idx))
    xml_file = os.path.join(layer_dir, "timeloop-mapper-topk{}.map+stats.xml".format(topk_idx))
    workload_file = os.path.join(base_dir, top_dir, sub_dir, "{}_layer{}.yaml".format(sub_dir, layer_idx))

    with open(workload_file, 'r') as f:
        workload_info = yaml.safe_load(f)

    # A workload without an 'M' dimension is handled as depthwise.
    dw = 'M' not in workload_info['problem']['instance']
    constraint_dir = 'constraints_dw' if dw else 'constraints'

    template_design = configuration_dict['TEMPLATE_DESIGN']
    if template_design in ('eyeriss_like', 'eyeriss_like_hbm2'):
        constraint_name = 'eyeriss_like_arch_constraints.yaml'
    elif template_design == 'output_stationary':
        constraint_name = 'simple_output_stationary_arch_constraints.yaml'
    else:
        # every other template falls back to the weight-stationary constraints
        constraint_name = 'simple_weight_stationary_arch_constraints.yaml'
    arch_constraint_file = os.path.join(base_dir, timeloop_dir, constraint_dir, constraint_name)

    mapping = xml2mapping(xml_file, workload_file, arch_constraint_file, dw)

    mapping_path = os.path.join(layer_dir, "mapping{}.yaml".format(topk_idx))
    with open(mapping_path, 'w') as f:
        _ = yaml.dump({'mapping': mapping}, f)
        
# Convert every top-k mapper result of every unique layer into a mapping YAML file.
for layer_idx in unique_layers:
    # top-k indices start at 1, matching the "timeloop-mapper-topk<k>" file names
    for k in range(1, topk + 1):
        convert_to_mapping(base_dir, timeloop_dir, top_dir, sub_dir, layer_idx, k)
        
def get_cmd_model(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir):
    """Build the working directory and `timeloop-model` command for one baseline layer.

    The command evaluates the layer with the first converted mapping
    (`mapping1.yaml`) found in the layer's `baseline_scheduling` directory.

    Args:
        workload_info: parsed workload YAML. Kept for signature compatibility;
            the generated command does not depend on it.
        layer_id: layer number used in the directory and file names.
        base_dir: path object supporting the `/` operator (e.g. `pathlib.Path`).
        timeloop_dir: timeloop directory, relative to `base_dir`.
        sub_dir: workload name; used as a directory and as the workload file prefix.
        top_dir: workload root directory, relative to `base_dir`.

    Returns:
        A two-element list `[cwd, timeloopcmd]`, both strings. The command ends
        with a trailing space.
    """
    cwd = f"{base_dir/timeloop_dir/'baseline_evaluation'/sub_dir/f'layer{layer_id}'}"

    # No constraint files are passed here: the command takes the architecture,
    # the component definitions, an explicit mapping and the workload.
    timeloopcmd = f"timeloop-model " \
                  f"{base_dir/timeloop_dir/'arch/baseline.yaml'} " \
                  f"{base_dir/timeloop_dir/'arch/components/*.yaml'} " \
                  f"{base_dir/timeloop_dir/'baseline_scheduling'/sub_dir/f'layer{layer_id}/mapping1.yaml'} " \
                  f"{base_dir/top_dir/sub_dir/sub_dir}_layer{layer_id}.yaml "
    return [cwd, timeloopcmd]

# Collect one (working directory, command) pair per unique layer for the
# baseline model evaluation.
cwd_list = []
cmd_list = []
for layer_id in unique_layers:
    workload_path = os.path.join(base_dir, top_dir, sub_dir, '{}_layer{}.yaml'.format(sub_dir, layer_id))
    with open(workload_path, 'r') as f:
        workload_info = yaml.safe_load(f)
    [cwd, cmd] = get_cmd_model(workload_info, layer_id, base_dir, timeloop_dir, sub_dir, top_dir)
    cwd_list.append(cwd)
    cmd_list.append(cmd)

# makedirs(exist_ok=True) creates missing parent directories as well and is a
# no-op when the directory already exists, so no existence check or
# catch-all `except` is needed.
os.makedirs(os.path.join(base_dir, timeloop_dir, 'baseline_evaluation', sub_dir), exist_ok=True)
try:
    for cwd, cmd in zip(cwd_list, cmd_list):
        print("Executing cmd: {}".format(cmd))
        os.makedirs(cwd, exist_ok=True)
        # The command is executed from inside the layer's own directory.
        os.chdir(cwd)
        # The command string contains a `*.yaml` glob, so it is run through the shell.
        os.system(cmd)
finally:
    # Always return to base_dir, even if a run is interrupted or fails.
    os.chdir(base_dir)

In [None]:
# Sum the baseline latency (cycles) and energy over all layers that take part
# in the search, reading each layer's timeloop-model statistics file.
baseline_latency = 0
baseline_energy = 0

for layer_id in range(1, n_layers + 1):
    if layer_id in layers_exclude_from_search:
        continue
    # Layers are evaluated under the id stored in layer_info, which may differ
    # from the layer's own position.
    layer_id_for_timeloop = layer_info[layer_id]['layer_id_for_timeloop']
    stats_file = os.path.join(base_dir, timeloop_dir, 'baseline_evaluation', sub_dir, "layer{}".format(layer_id_for_timeloop), \
                              "timeloop-model.stats.txt")

    # Reset per layer so a stats file without these lines cannot silently
    # reuse the values parsed for a previous layer.
    energy = None
    cycle = None
    with open(stats_file, 'r') as f:
        # only the last 200 lines of the file are scanned
        lines = f.read().split('\n')[-200:]
    for line in lines:
        # Parse the numbers explicitly instead of eval()-ing file contents.
        if line.startswith('Energy'):
            energy = float(line.split(': ')[1].split(' ')[0]) * float(10**6) # micro to pico
        elif line.startswith('Cycles'):
            cycle = int(line.split(': ')[1])

    if energy is None or cycle is None:
        raise ValueError("Could not find 'Energy' and 'Cycles' lines in {}".format(stats_file))

    baseline_latency += cycle
    baseline_energy += energy

#### 6-2. Calculate the latency and energy for secure accelerators

In [None]:
def calculate_total_stats(cost_dict, rehash_cost_dict, n_layers, exclude_layers):
    """Sum latency, energy and additional off-chip traffic over all included layers.

    Args:
        cost_dict: per-layer costs keyed by layer index (1..n_layers); each
            entry provides 'total_latency', 'total_energy',
            'total_redundant_bits' and 'total_hash_bits'.
        rehash_cost_dict: rehash costs keyed by a pair of layer indices
            (`key[0]`, `key[1]`); each entry provides 'total_latency',
            'total_energy' and 'add_memory_traffic'.
        n_layers: number of layers; indices 1..n_layers are visited.
        exclude_layers: container of layer indices to leave out. A rehash entry
            is skipped when either of its two layers is excluded.

    Returns:
        A tuple `(total_latency, total_energy, additional_off_chip_traffic)`
        where the traffic is a list `[redundant, hash, rehash]`.
    """
    total_latency = 0
    total_energy = 0
    additional_off_chip_traffic = [0, 0, 0] # redundant, hash, rehash

    for idx in range(1, n_layers + 1):
        # Use the argument, not a notebook-level global, to decide what to skip.
        if idx in exclude_layers:
            continue
        layer_cost = cost_dict[idx]
        total_latency += layer_cost['total_latency']
        total_energy += layer_cost['total_energy']
        additional_off_chip_traffic[0] += layer_cost['total_redundant_bits']
        additional_off_chip_traffic[1] += layer_cost['total_hash_bits']

    for key, rehash_cost in rehash_cost_dict.items():
        idx1 = key[0]
        idx2 = key[1]

        if idx1 in exclude_layers or idx2 in exclude_layers:
            continue

        total_latency += rehash_cost['total_latency']
        total_energy += rehash_cost['total_energy']
        additional_off_chip_traffic[2] += rehash_cost['add_memory_traffic']

    return total_latency, total_energy, additional_off_chip_traffic

# Totals for the three schedulers, in the order cts, cos, coc; every call uses
# the same layer count and exclusion list.
(cts_latency, cts_energy, cts_traffic), \
(cos_latency, cos_energy, cos_traffic), \
(coc_latency, coc_energy, coc_traffic) = [
    calculate_total_stats(costs, rehash_costs, n_layers, layers_exclude_from_search)
    for costs, rehash_costs in (
        (cts_cost_dict, cts_rehash_cost_dict),
        (cos_cost_dict, cos_rehash_cost_dict),
        (coc_cost_dict, coc_rehash_cost_dict),
    )
]

#### 6-3. Draw the figure

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

def plot_bar_graph(ax, data, labels, show_legend, title, show_ylabel):
    """Draw a grouped bar chart with the value printed above every bar.

    Args:
        ax: matplotlib axes to draw on.
        data: 2-D numpy array; rows are groups, columns are the bars within a group.
        labels: one legend label per column of `data`.
        show_legend: whether to draw the legend below the axes.
        title: axes title.
        show_ylabel: whether to label the y-axis ('Normalized Latency').
    """
    # Set the width of each bar
    num_groups = data.shape[0]
    num_bars = data.shape[1]
    bar_width = 1.0 / (num_bars + 3)
    space_width = 0.05

    # Set the x-axis positions for each group
    x_positions = np.arange(num_groups)

    # One colour per bar, sampled evenly from the 'bone' colormap
    ax.set_prop_cycle('color', plt.cm.bone(np.linspace(0, 1, num_bars)))

    # Plot each group of bars
    for i in range(num_bars):
        # Calculate the x-axis positions for each bar within each group
        x_pos = x_positions + i * (bar_width + space_width)
        rects = ax.bar(x_pos, data[:, i], width=bar_width, align='edge', label=labels[i], edgecolor='black')

        # Add value of each bar as text
        for rect in rects:
            height = rect.get_height()
            ax.annotate(f'{height:.2f}', xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points", ha='center', va='bottom', fontsize=14)

    # Hide the x-axis ticks and tick labels; the bars are identified by the legend
    ax.tick_params(axis='x', which='both', length=0, labelbottom=False)

    # Set the y-axis label
    if show_ylabel:
        ax.set_ylabel('Normalized Latency', fontsize=15)

    ax.yaxis.grid(True, linestyle='--', which='major', color='grey', alpha=0.5)

    # Set the ylim: leave headroom above the tallest bar (at least 1) for the value text
    max_val = data.max() + max((data.max() - data.min()) * 0.4, 1)
    ax.set_ylim(0, max_val)

    if show_legend:
        ax.legend(ncol=3, loc='lower center', fontsize=15, bbox_to_anchor=(0.5, -0.25))

    ax.set_title(title, fontsize=15, fontweight='bold', y=1.02)
    
# One group with three bars: each scheduler's total latency divided by the
# baseline latency.
scheduler_labels = ['Crypt-Tile-Single', 'Crypt-Opt-Single', 'Crypt-Opt-Cross']
normalized_latency = np.asarray(
    [float(latency) / float(baseline_latency) for latency in (cts_latency, cos_latency, coc_latency)]
).reshape((1, 3))

fig, ax = plt.subplots(figsize=(4, 3))
plot_bar_graph(ax, normalized_latency, scheduler_labels,
               show_legend=True, title=model_name, show_ylabel=True)

In [None]:
def plot_stacked_bar_graph(ax, data, labels, show_legend, title, ylog, show_ylabel):
    """Draw grouped bars where every bar is a stack of several components.

    Args:
        ax: matplotlib axes to draw on.
        data: list of 2-D numpy arrays, one per stacked component, all shaped
            (groups, bars). Components are stacked in list order, bottom first.
        labels: one legend label per component in `data`.
        show_legend: whether to draw the legend below the axes.
        title: axes title.
        ylog: whether to use a logarithmic y-axis.
        show_ylabel: whether to label the y-axis.
    """
    # Set the width of each bar
    num_groups = data[0].shape[0]
    num_bars = data[0].shape[1]
    bar_width = 1.0 / (num_bars + 3)
    space_width = 0.05

    # Set the x-axis positions for each group
    x_positions = np.arange(num_groups)

    # One hatch pattern per stacked component
    hatches = ['/', '\\', '|']
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
    # available in recent Matplotlib releases.
    cmap = plt.get_cmap('bone', 4)

    # Plot each group of bars
    for i in range(num_bars):
        # Calculate the x-axis positions for each bar within each group
        x_pos = x_positions + i * (bar_width + space_width)
        # Running height of the stack: one entry per group, matching x_pos
        bottom = np.zeros(num_groups)
        for j in range(len(data)):
            data_ = data[j]

            edgecolor = 'white' if i < 2 else 'black'
            # First pass: filled, hatched segment (the hatch takes the edge colour)
            ax.bar(x_pos, data_[:, i], width=bar_width, align='edge', edgecolor=edgecolor, hatch=hatches[j], \
                   bottom=bottom, color=cmap(i))
            # Second pass: unfilled segment that only adds a black outline
            ax.bar(x_pos, data_[:, i], width=bar_width, align='edge', edgecolor='black', \
                   bottom=bottom, color='none')
            bottom += data_[:, i]

    # Hide the x-axis ticks and tick labels
    ax.tick_params(axis='x', which='both', length=0, labelbottom=False)

    # Dashed horizontal grid lines at the major y ticks
    ax.yaxis.grid(True, linestyle='--', which='major', color='grey', alpha=0.5)

    if ylog:
        ax.set_yscale('log')

    if show_ylabel:
        ax.set_ylabel('Additional \n Off-chip Traffic (bits)', fontsize=15)

    ax.set_title(title, fontsize=15, fontweight='bold', y=1.02)

    if show_legend:
        # Add the legend: one hatched, unfilled swatch per stacked component
        legend_elements = [plt.Rectangle((0, 0), 1, 1, facecolor='none', edgecolor='black', linewidth=1, hatch=hatches[i]) \
                           for i in range(len(data))]
        ax.legend(legend_elements, labels, \
                  ncol=3, bbox_to_anchor=(0.5, -0.25), loc='lower center', fontsize=15)
        
# Each *_traffic list is [redundant, hash, rehash]; the figure stacks them in
# the order rehash (index 2), redundant (index 0), hash (index 1).
traffic_per_scheduler = (cts_traffic, cos_traffic, coc_traffic)
stacked_traffic = [
    np.asarray([traffic[component] for traffic in traffic_per_scheduler]).reshape((1, 3))
    for component in (2, 0, 1)
]

fig, ax = plt.subplots(figsize=(4, 3))
plot_stacked_bar_graph(ax, stacked_traffic, ['Rehash', 'Redundant', 'Hash'],
                       True, model_name, False, True)