## Step 2: AuthBlock Assignment

Run authentication block assignment for a given loopnest schedule for each layer. In this notebook, we use the per-layer top-1 loopnest schedule and run the authentication block assignment.

In [None]:
%load_ext autoreload
%autoreload 2

import os
import yaml
import shutil
from pathlib import Path

First, we define the architecture and the workload in the same way as before.

In [None]:
# Architecture description. The next step compares this dictionary against the
# config.yaml stored in each design version directory, so every key/value matters.
configuration_dict = {
    # Template design (constraints + memory hierarchy, i.e. the "dataflow")
    'TEMPLATE_DESIGN': 'eyeriss_like',

    # Bits per word for inputs / outputs / weights (integer data assumed)
    'WORDBITS': 16,

    # DRAM bandwidth in words / cycle (NOT bits / cycle)
    'DRAM_READ_BANDWIDTH': 32,
    'DRAM_WRITE_BANDWIDTH': 32,

    # SRAM (global buffer) setting
    # - one shared glb, or one glb per datatype?
    # - per glb (a single entry when shared): depth / width / #banks and bandwidths
    'SRAM_SHARED': True,
    'SRAM_DEPTH': [2 ** 13],
    'SRAM_WIDTH': [2 ** 7],
    'SRAM_BANKS': [32],                    # SRAM width and SRAM banks define the maximum possible bandwidth
    'SRAM_READ_BANDWIDTH': [32],
    'SRAM_WRITE_BANDWIDTH': [32],

    # PE array setting
    # - array shape is PE_X x PE_Y
    # - a PE has either one shared scratchpad or one scratchpad per datatype
    'PE_X': 14,
    'PE_Y': 12,
    'PE_SPAD_SHARED': False,
    'PE_SPAD_DEPTH': [192, 12, 16],        # Weight, IFmap, OFmap
    'PE_SPAD_WIDTH': [16, 16, 16],

    # Cryptographic engine setting
    # - engine + DRAM type (LPDDR4 + AES-GCM)
    # - AES-GCM cycles per block
    # - engines shared among all datatypes, or assigned per datatype
    'CRYPT_ENGINE_TYPE': 'effective_lpddr4_aesgcm',
    'CRYPT_ENGINE_CYCLE_PER_BLOCK': 11,    # avg. cycle/128bit

    'CRYPT_ENGINE_SHARED': False,
    'CRYPT_ENGINE_COUNT': [1, 1, 1],

    'EFFECTIVE_CONSERVATIVE': True,
}

# Look for an existing design version whose stored config.yaml is identical to
# configuration_dict. Every sub-directory of the design folder except 'template'
# is inspected; the first exact match becomes arch_dir.
design_dir = 'designs/{}'.format(configuration_dict['TEMPLATE_DESIGN'])
arch_dir = None
total_vers = 0   # number of version directories with a readable config.yaml
for path in os.listdir(design_dir):
    candidate_dir = os.path.join(design_dir, path)
    if path == 'template' or not os.path.isdir(candidate_dir):
        continue
    try:
        with open(os.path.join(candidate_dir, 'config.yaml'), 'r') as f:
            config_file = yaml.safe_load(f)
    except FileNotFoundError:
        print("No config.yaml file in the directory {}".format(str(candidate_dir)))
        continue
    except (OSError, yaml.YAMLError) as err:
        # Unreadable or malformed file: report the real cause instead of claiming
        # the file is missing, then keep scanning the remaining directories.
        print("Could not read config.yaml in the directory {}: {}".format(str(candidate_dir), err))
        continue

    total_vers += 1
    if config_file == configuration_dict:
        arch_dir = path
        print("Pre-existing folder found. Setting the arch_dir to {}".format(arch_dir))
        break

# Exception type kept as NameError so the cell fails the same way as before.
if arch_dir is None:
    raise NameError("Architecture is not found!")

...or, if you already know which folder you want to use, specify it here instead of running the cell above.

In [None]:
# Manual alternative to the search above: point directly at a design version
# directory and load the configuration stored in its config.yaml.
design_dir = 'designs/{}'.format('eyeriss_like') # define your design name here

arch_ver = 0
arch_dir = 'ver{}'.format(arch_ver)              # sub directory under designs/{name}/{arch_dir}
selected_dir = os.path.join(design_dir, arch_dir)

with open(os.path.join(selected_dir, 'config.yaml'), 'r') as config_stream:
    configuration_dict = yaml.safe_load(config_stream)

print("Setting the architecture directory to: {}".format(selected_dir))
print("Printing configuration:")
for name in configuration_dict:
    print("{}: {}".format(name, configuration_dict[name]))

Define the workload here. Skip the pytorch2timeloop conversion (should be done when generating loopnests)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision.models as model_zoo

import pytorch2timeloop as pytorch2timeloop

# Note: this version only supports nn.Conv2d (both normal convs and depthwise/pointwise convs) and nn.Linear
# Pick exactly one model by leaving its model_name / net pair uncommented.

# AlexNet
model_name = 'alexnet'
net = model_zoo.alexnet(pretrained=False)

# ResNet18
# model_name = 'resnet18'
# net = model_zoo.resnet18(pretrained=False)

# MobilenetV2
# model_name = 'mobilenet_v2'
# net = model_zoo.mobilenet_v2(pretrained=False)

# Input / Batch info
batch_size = 1
input_size = (3, 224, 224)

# Workload files are looked up under {top_dir}/{sub_dir}/ in the following cells.
top_dir = 'workloads'
sub_dir = '{}_batch{}'.format(model_name, batch_size)

print(net)

In [None]:
base_dir = Path.cwd()
timeloop_dir = 'designs/{}/{}'.format(configuration_dict['TEMPLATE_DESIGN'], arch_dir)

# Walk the model, number every Conv2d / Linear module starting from 1, load its
# workload description, and record which layers repeat an earlier workload.
n_layers = 0
layer_dict = {}             # layer number -> parsed workload yaml
layer_duplicate_info = {}   # duplicate layer number -> earliest layer with an identical workload
unique_layers = []          # layer numbers whose workload has not appeared earlier
workload_dir = os.path.join(base_dir, top_dir, sub_dir)

for module in net.modules():
    if not isinstance(module, (nn.Conv2d, nn.Linear)):
        continue

    n_layers += 1
    workload_path = os.path.join(workload_dir, '{}_layer{}.yaml'.format(sub_dir, n_layers))
    with open(workload_path, 'r') as workload_file:
        layer_dict[n_layers] = yaml.safe_load(workload_file)

    # identify the earliest duplicate layer (None when this workload is new)
    earliest_match = next((earlier for earlier in range(1, n_layers)
                           if layer_dict[earlier] == layer_dict[n_layers]), None)
    if earliest_match is None:
        unique_layers.append(n_layers)
    else:
        layer_duplicate_info[n_layers] = earliest_match

print(layer_duplicate_info)
print(unique_layers)

### Generate a DNN workload layer dependency dictionary

We need to take care of interlayer dependencies. Since the layer number does not imply the actual layer order/dependency, and often multiple layers can be dependent on one layer and vice versa (e.g., residual connections), we create a back-propagation graph and determine the dependency from the graph. 

Depending on your accelerator architecture assumptions, some post-processing operations can be performed on-the-fly. Here, we assume that ReLU activation and batch normalization can be performed on-the-fly (and thus do not affect the dependency), while pooling operations and adding multiple feature maps together (e.g., adding residual branches together) cannot be done on-the-fly (and thus break the dependency).

If you have different assumptions about the post-processing or using different versions of PyTorch with different backend operations, `BackpropGraph.isDependentLayer` has to be modified accordingly.

Define whether you want to ignore the interlayer dependency entirely (i.e., always use rehashing). Then, the dependency dictionary will ignore all interlayer dependency. 

Finally, constructing and analyzing the back-propagation graph can take > 5 minutes for deep models like MobilenetV2. Since this dictionary has to be constructed only once per model, we first search for an existing dictionary. If none is found, we generate the graph, create the dictionary, and save it.

In [None]:
# When True, no dependent_prev_layer / dependent_next_layer entries are recorded in
# layer_info, and the cached file is named layer_info_ignore_interlayer.yaml.
ignore_interlayer = False

In [None]:
from pytorch_layer_dependency_utils import BackpropGraph

# The dependency dictionary is cached next to the workload files; the file name
# encodes whether interlayer dependencies are ignored.
workload_path = os.path.join(base_dir, top_dir, sub_dir, 'layer_info_{}.yaml'.format('ignore_interlayer' if ignore_interlayer \
                                                                                     else 'interlayer'))
try:
    with open(workload_path, 'r') as f:
        layer_info = yaml.safe_load(f)
    # Touch every expected layer so an empty or incomplete cache is rejected here.
    cached_rows = [(layer_idx, layer_info[layer_idx]) for layer_idx in range(1, n_layers + 1)]
except (OSError, yaml.YAMLError, LookupError, TypeError) as err:
    # Only cache-related failures trigger a rebuild; anything else (including a
    # keyboard interrupt) propagates instead of being silently swallowed.
    print("No usable cached layer info at {} ({}: {}); building it from the model.".format(
        workload_path, type(err).__name__, err))
    layer_info = None

if layer_info is not None:
    for layer_idx, cached_info in cached_rows:
        print(layer_idx, cached_info)
else:
    graph = BackpropGraph(net, [1, input_size[0], input_size[1], input_size[2]])
    consecutive_dict, dependent_dict = graph.get_dependency_info()

    # construct layer_info
    layer_info = {}
    for layer_idx in range(1, n_layers + 1):
        info = {}
        # duplicate layers reuse the timeloop results of their earliest identical layer
        if layer_idx in unique_layers:
            info['layer_id_for_timeloop'] = layer_idx
        else:
            info['layer_id_for_timeloop'] = layer_duplicate_info[layer_idx]
        info['prev_layer'] = []
        info['next_layer'] = []
        info['dependent_prev_layer'] = []
        info['dependent_next_layer'] = []
        layer_info[layer_idx] = info

    # fill in forward links and mirror each of them as a backward link
    for layer_idx in range(1, n_layers + 1):
        consecutive = consecutive_dict[layer_idx]
        dependent = dependent_dict[layer_idx]
        layer_info[layer_idx]['next_layer'].extend(consecutive)
        for i in consecutive:
            layer_info[i]['prev_layer'].append(layer_idx)
        if len(dependent) > 0 and not ignore_interlayer:
            layer_info[layer_idx]['dependent_next_layer'].extend(dependent)
            for i in dependent:
                layer_info[i]['dependent_prev_layer'].append(layer_idx)

    for layer_idx in range(1, n_layers + 1):
        print(layer_idx, layer_info[layer_idx])

    # store the results - building them can take long for deep models like MobileNet
    with open(workload_path, 'w') as f:
        _ = yaml.dump(layer_info, f)


### Define hyperparameters for the AuthBlock assignment

Define whether you want to use 1) a fixed AuthBlock size and orientation, 2) tile-as-an-AuthBlock, or 3) search for the optimal AuthBlock assignment. For this experiment, we assume that the tag size is 64 bits. 

In [None]:
# Tag size in bits, and word size in bits taken from the architecture configuration
TAG_SIZE = 64
WORD_SIZE = configuration_dict['WORDBITS']

# AuthBlock assignment method: leave exactly one of the following uncommented
# mode = "fixed" 
# mode = "tile"
mode = "search"

# Only for mode = "fixed": uncomment and set the block size and orientation
# authblock_size = 8                     # in words
# authblock_orientation = [3, 2, 1, 0]   # [column, row, in_channel, out_channel] --> column is the most innermost loop

# If you want to skip the search for certain layers (e.g., non-convolution layers in AlexNet), define here:
# search_input_layers_except = []

# Whether the following cells print per-layer statistics
print_all_stats = True

In [None]:
# Translate the selected mode into the per-layer dictionaries that
# generate_memory_traffic_dict receives:
#   search_block_size[layer]      -> bool, the mode's search flag
#   predefined_u[layer][W/I/O]    -> block size in words, 'tile', or -1
#   predefined_perm[layer][W/I/O] -> orientation list, or None
search_block_size = {}
predefined_u = {}
predefined_perm = {}

if mode == "fixed":
    # authblock_size / authblock_orientation must be defined in the cell above
    assert (authblock_size > 1 and authblock_orientation is not None)
    search_flag = False
    predefined_u_value = authblock_size
    predefined_perm_value = authblock_orientation

elif mode == "tile":
    search_flag = False
    predefined_u_value = 'tile'
    predefined_perm_value = None

elif mode == "search":
    search_flag = True
    predefined_u_value = 'tile'
    predefined_perm_value = None

else:
    # Previously an unknown mode surfaced later as a NameError on search_flag.
    raise ValueError("Unknown AuthBlock assignment mode {!r}; expected 'fixed', 'tile' or 'search'".format(mode))

# In search mode the input block size is left at -1; weights and outputs keep the predefined value.
input_u_value = predefined_u_value if mode != "search" else -1

for layer_id in range(1, n_layers+1):
    search_block_size[layer_id] = search_flag
    predefined_u[layer_id] = {'W': predefined_u_value,
                              'I': input_u_value,
                              'O': predefined_u_value}
    predefined_perm[layer_id] = {'W': predefined_perm_value,
                                 'I': predefined_perm_value,
                                 'O': predefined_perm_value}

### Generate memory traffic dict based on your choice of AuthBlock assignment method

In [None]:
from testbench_utils import generate_memory_traffic_dict

memory_traffic_dict, block_info_dict = generate_memory_traffic_dict(
    n_layers, layer_info, predefined_u, predefined_perm,
    base_dir, timeloop_dir, top_dir, sub_dir,
    search_block_size, u_multiple_of=128//WORD_SIZE,
    WORD_SIZE=WORD_SIZE, TAG_SIZE=TAG_SIZE)

# (datatype key, section title), printed in this order for every layer
traffic_sections = [('W', "\n--Weights:"), ('I', "\n--Inputs:"), ('O', "\n--Outputs:")]

# (line template, memory traffic key), printed in this order for every datatype
traffic_lines = [("----Base read: {}", 'base_read'),
                 ("----Base write: {}", 'base_write'),
                 ("----Redundant read: {}", 'redundant_read'),
                 ("----Redundant write: {}", 'redundant_write'),
                 ("----Tag read: {}", 'tag_read'),
                 ("----Tag write: {}", 'tag_write')]

if print_all_stats:
    for layer_id in range(1, n_layers + 1):
        # look everything up first so a missing entry fails before anything is printed
        traffic_by_type = {datatype: memory_traffic_dict[layer_id][datatype] for datatype, _title in traffic_sections}
        blocks_by_type = {datatype: block_info_dict[layer_id][datatype] for datatype, _title in traffic_sections}

        print("\nLayer {} stats:".format(layer_id))
        print("\n--Layer info: layer id in timeloop is {}".format(layer_info[layer_id]['layer_id_for_timeloop']))

        for datatype, title in traffic_sections:
            traffic = traffic_by_type[datatype]
            block_info = blocks_by_type[datatype]

            print(title)
            for template, field in traffic_lines:
                print(template.format(traffic[field]))
            print("----Auth block size: {}".format(block_info['u']))

            # only the input section reports permutation / sharing details
            if datatype == 'I':
                print("----Auth tile read permutation: {}".format(block_info['permutation']))
                if 'shared' in block_info:
                    print("----Auth block assignment shared layers: {}".format(block_info['shared']))
                if 'reference_layer' in block_info:
                    print("----Reference layer among shared layers: {}".format(block_info['reference_layer']))

        print("\n===============================================================================================")

### Generate rehash information dict

In [None]:
from testbench_utils import generate_rehash_info_dict
rehash_info_dict = generate_rehash_info_dict(
    n_layers, layer_info, block_info_dict,
    base_dir, timeloop_dir, top_dir, sub_dir)

# (line template, rehash info key), printed in this order for every layer pair
rehash_lines = [
    ("--Base read (encrypted ofmap read of prev layer): {}", 'base_read'),
    ("--Tag read (tags to check the integrity when reading the ofmap of prev layer): {}", 'tag_read'),
    ("--Base write (once we re-encrypt the data according to ifmap of next layer): {}", 'base_write'),
    ("--Tag write (we have to update tags according to the new blocks): {}", 'tag_write'),
]

if print_all_stats:
    for layer_pair in rehash_info_dict.keys():
        print("Rehashing between layers {} - {}".format(layer_pair[0], layer_pair[1]))
        for template, field in rehash_lines:
            print(template.format(rehash_info_dict[layer_pair][field]))

### Convert memory traffic information to cryptographic action counts

In [None]:
from testbench_utils import get_action_dict, get_action_dict_for_rehash
cryptographic_action_count_dict = get_action_dict(
    n_layers, memory_traffic_dict, block_info_dict,
    WORD_SIZE, TAG_SIZE, AES_DATAPATH=128)
rehash_action_count_dict = get_action_dict_for_rehash(rehash_info_dict, block_info_dict, WORD_SIZE, TAG_SIZE, AES_DATAPATH=128)

# (datatype key, section title), printed in this order for every layer
action_sections = [('W', "\n--Weights:"), ('I', "\n--Inputs:"), ('O', "\n--Outputs:")]

# (line template, action count key) for the per-layer report
layer_action_lines = [("----AES encryption/decryption count: {}", 'aes_engine_count'),
                      ("----GF multiplication count: {}", 'gf_mult_count'),
                      ("----XOR count: {}", 'xor_count'),
                      ("----Additional memory read (bits): {}", 'additional_read_bits'),
                      ("----Additional memory write (bits): {}", 'additional_write_bits')]

# (line template, action count key) for the rehash report
rehash_action_lines = [("--AES encryption/decryption count: {}", 'aes_engine_count'),
                       ("--GF multiplication count: {}", 'gf_mult_count'),
                       ("--XOR count: {}", 'xor_count'),
                       ("--Additional memory read (bits): {}", 'additional_read_bits'),
                       ("--Additional memory write (bits): {}", 'additional_write_bits')]

if print_all_stats:
    print("Layers\n\n")
    for layer_id in range(1, n_layers + 1):
        print("\nLayer {} stats:".format(layer_id))

        for datatype, title in action_sections:
            counts = cryptographic_action_count_dict[layer_id][datatype]
            print(title)
            for template, field in layer_action_lines:
                print(template.format(counts[field]))

    print("\n\nRehash\n\n")
    for layer_pair in rehash_action_count_dict:
        print("\nRehash for layer {} - {}:".format(layer_pair[0], layer_pair[1]))
        for template, field in rehash_action_lines:
            print(template.format(rehash_action_count_dict[layer_pair][field]))

### Generate a final stat.csv file with estimated latency & energy

This is the energy / latency estimate from Banerjee (2017) for AES-GCM engine implementations in 40nm. We ignore the XOR energy for now. If you want to use a different energy / latency profile for AES-GCM (+ XOR), define it here.

In [None]:
# Choose the AES-GCM energy / latency profile that matches the configured engine.
# Energy is in pJ / op (e.g., 128-bit AES, 128-bit * 128-bit GCM); latency is in cycle / op.
crypt_cycle_per_block = configuration_dict['CRYPT_ENGINE_CYCLE_PER_BLOCK']

# Fully pipelined AES-GCM: 1 cycle / 1 cycle
if crypt_cycle_per_block == 1:
    AESGCM_energy_profile = {'AES': 1.29 * 128, 'GCM': 57.7, 'XOR': 0} # pJ / op (e.g., 128-bit AES, 128-bit * 128-bit GCM)
    AESGCM_latency_profile = {'AES': 1, 'GCM': 1, 'XOR': 1} # cycle / op 

# Parallel AES-GCM: 11 cycle / 8 cycle
elif crypt_cycle_per_block == 11:
    AESGCM_energy_profile = {'AES': 1.52 * 128, 'GCM': 82.4, 'XOR': 0} # pJ / op (e.g., 128-bit AES, 128-bit * 128-bit GCM)
    AESGCM_latency_profile = {'AES': 11, 'GCM': 8, 'XOR': 1} # cycle / op 

# Serial AES-GCM: 336 cycle / 128 cycle
elif crypt_cycle_per_block == 336:
    AESGCM_energy_profile = {'AES': 6 * 128, 'GCM': 345.6, 'XOR': 0} # pJ / op (e.g., 128-bit AES, 128-bit * 128-bit GCM)
    AESGCM_latency_profile = {'AES': 336, 'GCM': 128, 'XOR': 1} # cycle / op 

# Any other value previously left both profiles undefined and failed later with a NameError.
else:
    raise ValueError("No AES-GCM energy/latency profile is defined for CRYPT_ENGINE_CYCLE_PER_BLOCK = {}; "
                     "add a profile for it in this cell (known values: 1, 11, 336)".format(crypt_cycle_per_block))

In [None]:
import csv

evaluation_folder = 'evaluation'

# For LPDDR4. Multiplied by bit counts and added to the pJ energy totals below.
MEMORY_READ_PER_BIT_ENERGY = 8
MEMORY_WRITE_PER_BIT_ENERGY = 8

# Datatype keys in the order their columns appear in each row; CRYPT_ENGINE_COUNT
# is indexed in the same order when the engines are not shared.
STAT_DATATYPES = ('W', 'I', 'O')


def _parse_number(text):
    """Convert a numeric token to int when possible, else float (no eval on file content)."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def _read_baseline_stats(stats_file):
    """Return (cycles, energy in pJ) read from the last 200 lines of a timeloop stats file.

    Raises ValueError when either value is missing, so a layer can never silently
    reuse the numbers parsed for the previous layer.
    """
    cycle = None
    energy = None
    with open(stats_file, 'r') as f:
        lines = f.read().split('\n')[-200:]
    for line in lines:
        if line.startswith('Energy'):
            energy = _parse_number(line.split(': ')[1].split(' ')[0]) * float(10**6) # micro to pico
        elif line.startswith('Cycles'):
            cycle = _parse_number(line.split(': ')[1])
    if cycle is None or energy is None:
        raise ValueError("Could not find both 'Cycles' and 'Energy' in {}".format(stats_file))
    return cycle, energy


summary_header = ['Layer#', 'Baseline Cycle', 'Baseline Energy (pJ)', \
                  'Weight AES Count', 'Weight GFMult Count', 'Weight XOR Count', \
                  'Weight Additional Memory Read (bits)', 'Weight Additional Memory Write (bits)', \
                  'Weight Authentication Block Size', 'Weight Tag Bits', 'Weight Redundant Bits', \
                  'Input AES Count', 'Input GFMult Count', 'Input XOR Count', \
                  'Input Additional Memory Read (bits)', 'Input Additional Memory Write (bits)', \
                  'Input Authentication Block Size', 'Input Authentication Permutation', \
                  'Input Tag Bits', 'Input Redundant Bits', \
                  'Output AES Count', 'Output GFMult Count', 'Output XOR Count', \
                  'Output Additional Memory Read (bits)', 'Output Additional Memory Write (bits)', \
                  'Output Authentication Block Size', 'Output Tag Bits', 'Output Redundant Bits', \
                  'CryptEngine Latency', 'Final Memory Read/Write Latency', 'Total Latency', \
                  'CryptEngine Energy (pJ)', 'Additional Memory Read/Write Energy (pJ)', 'Total Energy (pJ)', \
                  'Additional Memory Traffic (bits)', 'Tag Bits', 'Redundant Bits']

summaries = []

# ---- one row per layer ----
for layer_id in range(1, n_layers + 1):
    layer_id_for_timeloop = layer_info[layer_id]['layer_id_for_timeloop']
    stats_file = os.path.join(base_dir, timeloop_dir, evaluation_folder, sub_dir, "layer{}".format(layer_id_for_timeloop), \
                              "timeloop-model.stats.txt")
    cycle, energy = _read_baseline_stats(stats_file)
    summary = [layer_id, cycle, energy]

    total_memory_read_bits = 0
    total_memory_write_bits = 0
    additional_read_bits = 0
    additional_write_bits = 0
    tag_bits = 0
    redundant_bits = 0

    aes_counts = []
    gcm_counts = []
    xor_counts = []

    for datatype in STAT_DATATYPES:
        block_info = block_info_dict[layer_id][datatype]
        action_count = cryptographic_action_count_dict[layer_id][datatype]

        total_memory_read_bits += action_count['total_read_bits']
        total_memory_write_bits += action_count['total_write_bits']

        additional_read_bits += action_count['additional_read_bits']
        additional_write_bits += action_count['additional_write_bits']

        tag_bits += action_count['tag_bits']
        redundant_bits += action_count['redundant_bits']

        # aes counts, gcm_counts, xor_counts, additional_read_bits, additional_write_bits, u
        summary.extend([action_count['aes_engine_count'], \
                        action_count['gf_mult_count'], \
                        action_count['xor_count'], \
                        action_count['additional_read_bits'], \
                        action_count['additional_write_bits'], \
                        block_info['u']])
        # only the input columns include the permutation
        if datatype == 'I':
            summary.append(block_info['permutation'])
        summary.extend([action_count['tag_bits'], \
                        action_count['redundant_bits']])

        aes_counts.append(action_count['aes_engine_count'])
        gcm_counts.append(action_count['gf_mult_count'])
        xor_counts.append(action_count['xor_count'])

    # get crypto-latency for this layer
    if configuration_dict['CRYPT_ENGINE_SHARED']:
        aes_latency = sum(aes_counts) * (AESGCM_latency_profile['AES'] + AESGCM_latency_profile['XOR']) \
                      / sum(configuration_dict['CRYPT_ENGINE_COUNT'])
        gcm_latency = sum(gcm_counts) * (AESGCM_latency_profile['GCM'] + AESGCM_latency_profile['XOR']) \
                      / sum(configuration_dict['CRYPT_ENGINE_COUNT'])
        crypt_latency = max(aes_latency, gcm_latency) # assuming AES and GF can be pipelined
    else:
        # per-datatype engines: the slowest datatype determines the latency
        aes_latency = [aes_counts[i] * (AESGCM_latency_profile['AES'] + AESGCM_latency_profile['XOR']) \
                       / (configuration_dict['CRYPT_ENGINE_COUNT'][i]) for i in range(3)]
        gcm_latency = [gcm_counts[i] * (AESGCM_latency_profile['GCM'] + AESGCM_latency_profile['XOR']) \
                       / (configuration_dict['CRYPT_ENGINE_COUNT'][i]) for i in range(3)]
        crypt_latency = max([max(aes_latency[i], gcm_latency[i]) for i in range(3)])

    # bits / (words per cycle * bits per word) = cycles
    memory_latency = max(total_memory_read_bits / (configuration_dict['DRAM_READ_BANDWIDTH'] * configuration_dict['WORDBITS']), \
                         total_memory_write_bits / (configuration_dict['DRAM_WRITE_BANDWIDTH'] * configuration_dict['WORDBITS']))
    total_latency = max(cycle, crypt_latency, memory_latency)

    summary.extend([crypt_latency, memory_latency, total_latency])

    # get crypto-energy for this layer
    aes_energy = sum(aes_counts) * AESGCM_energy_profile['AES']
    gcm_energy = sum(gcm_counts) * AESGCM_energy_profile['GCM']
    xor_energy = sum(xor_counts) * AESGCM_energy_profile['XOR']

    # only the additional traffic is charged here, on top of the baseline energy
    memory_energy = additional_read_bits * MEMORY_READ_PER_BIT_ENERGY + additional_write_bits * MEMORY_WRITE_PER_BIT_ENERGY
    total_energy = energy + (aes_energy + gcm_energy + xor_energy) + memory_energy

    additional_mem_traffic = additional_read_bits + additional_write_bits

    summary.extend([(aes_energy + gcm_energy + xor_energy), memory_energy, total_energy])
    summary.extend([additional_mem_traffic, tag_bits, redundant_bits])

    summaries.append(summary)

# ---- one row per rehash between two layers ----
for key in rehash_action_count_dict.keys():
    rehash_counts = rehash_action_count_dict[key]

    summary = ["Rehash{}-{}".format(key[0], key[1])]
    # zero-fill the baseline (2) and weight (8) / input (9) / output (8) columns
    summary.extend([0] * (2 + 8 + 9 + 8))

    # when there are non-shared AES-GCM engine
    # rehashing only for ifmap - ofmap
    # NOTE(review): CRYPT_ENGINE_SHARED is not consulted here - confirm this is intended for shared engines.
    rehash_engine_count = configuration_dict['CRYPT_ENGINE_COUNT'][1] + configuration_dict['CRYPT_ENGINE_COUNT'][2]
    aes_latency = rehash_counts['aes_engine_count'] * (AESGCM_latency_profile['AES'] + AESGCM_latency_profile['XOR']) \
                  / rehash_engine_count
    gcm_latency = rehash_counts['gf_mult_count'] * (AESGCM_latency_profile['GCM'] + AESGCM_latency_profile['XOR']) \
                  / rehash_engine_count
    crypt_latency = max(aes_latency, gcm_latency)

    aes_energy = rehash_counts['aes_engine_count'] * AESGCM_energy_profile['AES']
    gcm_energy = rehash_counts['gf_mult_count'] * AESGCM_energy_profile['GCM']
    xor_energy = rehash_counts['xor_count'] * AESGCM_energy_profile['XOR']

    memory_latency = max(rehash_counts['total_read_bits'] / \
                         (configuration_dict['DRAM_READ_BANDWIDTH'] * configuration_dict['WORDBITS']), \
                         rehash_counts['total_write_bits'] / \
                         (configuration_dict['DRAM_WRITE_BANDWIDTH'] * configuration_dict['WORDBITS']))
    # all rehash traffic is charged here, not just the additional bits
    memory_energy = rehash_counts['total_read_bits'] * MEMORY_READ_PER_BIT_ENERGY + \
                    rehash_counts['total_write_bits'] * MEMORY_WRITE_PER_BIT_ENERGY

    additional_mem_traffic = rehash_counts['total_read_bits'] + \
                             rehash_counts['total_write_bits']

    tag_bits = rehash_counts['additional_read_bits'] + \
               rehash_counts['additional_write_bits']

    total_latency = max(crypt_latency, memory_latency)
    total_energy = (aes_energy + gcm_energy + xor_energy) + memory_energy

    summary.extend([crypt_latency, memory_latency, total_latency, \
                    (aes_energy + gcm_energy + xor_energy), memory_energy, total_energy])
    summary.extend([additional_mem_traffic, tag_bits, 0])

    summaries.append(summary)

# every row must line up with the header before anything is written
assert all(len(row) == len(summary_header) for row in summaries), "row width does not match summary_header"

write_dst = os.path.join(base_dir, timeloop_dir, evaluation_folder, sub_dir, 'stat.csv')
# newline='' lets the csv module control line endings (avoids blank rows on Windows)
with open(write_dst, 'w', newline='') as f:
    csv_file = csv.writer(f)
    csv_file.writerow(summary_header)
    csv_file.writerows(summaries)

### Or run them all together with one top-level function

In [None]:
from authblock_assignment import AuthBlockAssignment

# Single call replacing the step-by-step cells above. Note that the mode is passed
# here as a literal ("search"), not taken from the `mode` variable set earlier.
AuthBlockAssignment(
    n_layers,
    layer_info,
    base_dir,
    timeloop_dir,
    top_dir,
    sub_dir,
    configuration_dict,
    mode="search",
    authblock_size=-1,
    authblock_orientation=None,
    joint=False,
    generate_summary=True,
    return_cost_dict=False,
)