In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Add ./model to the import search path. The path is relative, so it resolves
# against the kernel's current working directory.
import sys
sys.path.append("./model")

In [3]:
import torch
import argparse
import functools
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any, Optional

In [4]:
import analyze
from quant import *
from outlier import *
from eval import *
from collections import defaultdict
from pprint import pprint
from modelutils_llama import quantize_model_llama, reorder_model_llama, quantize_model_gptq_llama,  add_act_quant_wrapper_llama
from modelutils_opt import quantize_model_opt, reorder_model_opt, quantize_model_gptq_opt,  add_act_quant_wrapper_opt
from modelutils_mixtral import quantize_model_mixtral, add_act_quant_wrapper_mixtral, reorder_model_mixtral
from parallel_utils import map_layers_to_multi_gpus
from LMClass import LMClass
from eval import pattern_match
from lm_eval import tasks as lm_tasks
from lm_eval import evaluator as lm_evaluator
from datautils import *
import qLinearLayer

In [5]:
# Print tensors with 10 decimal places; used further down when two layer
# outputs are printed side by side for comparison.
torch.set_printoptions(precision=10)

In [6]:
# Device used by the perplexity evaluation and by the non-multi-GPU branch below.
DEV = torch.device('cuda:0')
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# The loaded object is used as a complete model later (model.seqlen,
# model.model.layers) and is moved to DEV right after loading.
model = torch.load("./saved/llama2-7b_quantized.pth").to(DEV)

In [7]:
# `inputs` is indexed later with string keys of the form
# "layers.<n>.self_attn.q_proj.input".
# NOTE(review): presumably these are captured per-layer activations - confirm.
# `outputs` is loaded here but not referenced by the cells visible in this notebook.
inputs = torch.load("./saved/llama2-7b_inputs.pth")
outputs = torch.load("./saved/llama2-7b_outputs.pth")

# Perplexity (PPL)

In [27]:
datasets = ['wikitext2']

for dataset in datasets:
    # get_loaders returns two loaders; only `testloader` is consumed in this cell.
    loader_kwargs = dict(seed=0, model="./llama2-7b", seqlen=model.seqlen)
    dataloader, testloader = get_loaders(dataset, **loader_kwargs)

    print(f"Evaluating {dataset} ...")
    ppl = llama_eval(model, testloader, DEV)
    print(f"targetResult,{dataset},{ppl:.3f}")

Evaluating wikitext2 ...


100%|██████████| 32/32 [02:04<00:00,  3.90s/it]


targetResult,wikitext2,6.025


# Zero-shot evaluation

In [7]:
# Argument schema, one (flags, keyword-options) pair per argument, registered in
# this order. The resulting `args` namespace is handed to LMClass and read for
# the multi-GPU / lm_eval settings in the next cell.
ARGUMENT_SPECS = [
    # --- positional arguments ---
    (('model',), dict(
        type=str,
        help='LlaMa model to load; pass location of hugginface converted checkpoint.')),
    (('dataset',), dict(
        type=str, choices=['wikitext2', 'ptb', 'c4'],
        help='Where to extract calibration data from.')),
    # --- calibration sampling ---
    (('--seed',), dict(
        type=int, default=0,
        help='Seed for sampling the calibration data.')),
    (('--nsamples',), dict(
        type=int, default=128,
        help='Number of calibration data samples.')),
    # --- quantization method ---
    (('--wbits',), dict(
        type=int, default=16, choices=[2, 3, 4, 8, 16],
        help='#bits to use for quantizing weight; use 16 for evaluating base model.')),
    (('--abits',), dict(
        type=int, default=16, choices=[2, 3, 4, 8, 16],
        help='#bits to use for quantizing activation; use 16 for evaluating base model.')),
    (('--exponential',), dict(
        action='store_true',
        help='Whether to use exponent-only for weight quantization.')),
    (('--a_sym',), dict(
        action='store_true',
        help='Whether to perform symmetric quantization. Default is asymmetric.')),
    (('--w_sym',), dict(
        action='store_true',
        help='Whether to perform symmetric quantization. Default is asymmetric.')),
    (('--static',), dict(
        action='store_true',
        help='Whether to perform static quantization (For activtions). Default is dynamic. (Deprecated in Atom)')),
    # NOTE: the two group-size help texts mention 128, but the declared default is 0.
    (('--weight_group_size',), dict(
        type=int, default=0, choices=[0, 32, 64, 128, 256, 384, 768],
        help='Group size when quantizing weights. Using 128 as default quantization group.')),
    (('--weight_channel_group',), dict(
        type=int, default=1,
        help='Group size of channels that will quantize together. (only for weights now)')),
    (('--act_group_size',), dict(
        type=int, default=0, choices=[0, 64, 128, 256, 384, 768],
        help='Group size when quantizing activations. Using 128 as default quantization group.')),
    (('--reorder',), dict(
        action='store_true',
        help='Whether to keep salient weight unquantized.')),
    (('--act_sort_metric',), dict(
        type=str, default='hessian', choices=['abs_mean', 'hessian'],
        help='The metric used to sort the activations.')),
    (('--keeper',), dict(
        type=int, default=0,
        help='Group size to keep outliers.')),
    (('--keeper_precision',), dict(
        type=int, default=0, choices=[0, 1, 2, 3],
        help='Precision to keep outliers. 0 for FP16; 1 for E5M2; 2 for E4M3; 3 for INT8 Quant.')),
    (('--cache_index',), dict(
        action='store_true',
        help='Whether to use cached reorder index')),
    (('--tiling',), dict(
        type=int, default=0, choices=[0, 16],
        help='Tile-wise quantization granularity (Deprecated in Atom).')),
    (('--kv_cache',), dict(
        action='store_true',
        help='Whether to quant KV_Cache')),
    (('--use_gptq',), dict(
        action='store_true',
        help='Whether to use GPTQ for weight quantization.')),
    (('--percdamp',), dict(
        type=float, default=.01,
        help='Percent of the average Hessian diagonal to use for dampening.')),
    (('--a_clip_ratio',), dict(
        type=float, default=1.0,
        help='Clip ratio for activation quantization. new_max = max * clip_ratio')),
    (('--w_clip_ratio',), dict(
        type=float, default=1.0,
        help='Clip ratio for weight quantization. new_max = max * clip_ratio')),
    (('--kv_clip_ratio',), dict(
        type=float, default=1.0,
        help='Clip ratio for kv cache quantization. new_max = max * clip_ratio')),
    # --- evaluation ---
    (("--eval_ppl",), dict(
        action="store_true",
        help='Whether to evaluate perplexity.')),
    (("--eval_common_sense",), dict(
        action="store_true",
        help='Whether to evaluate zero-shot accuray on commonsense reasoning tasks.')),
    (("--multigpu",), dict(
        action="store_true",
        help="at eval, map model to multiple gpus")),
    (("--lm_eval_num_fewshot",), dict(
        type=int, default=0,
        help="Number of shots in lm evaluation. Default is 0 for zero-shot.")),
    (("--lm_eval_limit",), dict(
        type=int, default=-1,
        help="Limit the number of examples in lm evaluation")),
    # --- persistence / data format ---
    (('--save_dir',), dict(
        type=str, default='./saved',
        help='Path to store the reordering indices and quantized weights.')),
    (('--quant_type',), dict(
        type=str, default='int', choices=['int', 'fp'],
        help='Determine the mapped data format by quant_type + n_bits. e.g. int8, fp4.')),
    # store_true combined with default=True: the value is True with or without the flag.
    (('--save_model',), dict(
        action="store_true", default=True,
        help='Whether to save the quantized model.')),
]

parser = argparse.ArgumentParser()
for flags, options in ARGUMENT_SPECS:
    parser.add_argument(*flags, **options)

# Explicit argument list, so nothing is read from the kernel's own sys.argv.
cli_args = [
    "/root/project/Atom/llama2-7b",
    "wikitext2",
    "--wbits", "4", "--abits", "4", "--a_sym", "--w_sym", "--save_model",
    "--act_group_size", "128", "--weight_group_size", "128", "--weight_channel_group", "2",
    "--reorder", "--act_sort_metric", "hessian", "--cache_index",
    "--a_clip_ratio", "0.9", "--w_clip_ratio", "0.85", "--kv_clip_ratio", "1.0",
    "--keeper", "128", "--keeper_precision", "3", "--kv_cache", "--use_gptq",
    "--eval_common_sense", "--lm_eval_limit", "-1", "--multigpu",
]
args = parser.parse_args(args=cli_args)

In [11]:
# Wrap the already-loaded model for lm_eval, switch to eval mode and disable gradients.
lm = LMClass(args, model)
lm.seqlen = 2048
lm.model.eval()
for param in lm.model.parameters():
    param.requires_grad = False

# Device placement: single device unless --multigpu was requested.
model_path = args.model.lower()
is_llama_like = ("llama" in model_path) or ("mixtral" in model_path)

if not args.multigpu:
    lm._device = DEV
    lm.model = lm.model.to(lm.device)
elif is_llama_like:
    backbone = lm.model.model
    map_layers_to_multi_gpus(backbone.layers)
    input_device = backbone.layers[0].device
    output_device = backbone.layers[-1].device
    # First and last decoder layers must end up on the same device.
    assert input_device == output_device
    lm._device = input_device
    backbone.embed_tokens.to(input_device)
    backbone.norm.to(output_device)
    lm.model.lm_head.to(output_device)
elif "opt" in model_path:
    decoder = lm.model.model.decoder
    map_layers_to_multi_gpus(decoder.layers)
    input_device = decoder.layers[0].device
    output_device = decoder.layers[-1].device
    assert input_device == output_device
    lm._device = input_device
    decoder.embed_tokens.to(input_device)
    decoder.embed_positions.to(input_device)
    decoder.final_layer_norm.to(input_device)
    lm.model.lm_head.to(output_device)

# Run the selected lm_eval tasks.
results = {}
tasks_str = "piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande"
task_names = pattern_match(tasks_str.split(","), lm_tasks.ALL_TASKS)
print(f"Selected Tasks: {task_names}")

task_dict = lm_tasks.get_task_dict(task_names)
# -1 means "no limit" and is passed on as None.
eval_limit = None if args.lm_eval_limit == -1 else args.lm_eval_limit
t_results = lm_evaluator.evaluate(
    lm, task_dict, num_fewshot=args.lm_eval_num_fewshot, limit=eval_limit
)
results.update(t_results)
pprint(results)

# Summary line per task: 'acc_norm' for the listed tasks, plain 'acc' otherwise.
results_dict = results['results']
NORMALIZED_TASKS = ('piqa', 'arc_easy', 'arc_challenge', 'hellaswag')
for task_name in tasks_str.split(','):
    metric = 'acc_norm' if task_name in NORMALIZED_TASKS else 'acc'
    print(f"INFO {task_name} : {results_dict[task_name][metric]*100:.2f}")

vocab size:  32000
map layer 0 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 1 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 2 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 3 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 4 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 5 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 6 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 7 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 8 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576, 13504)]
map layer 9 to gpu 0, [(0, 24576, 13504), (1, 24576, 14086), (2, 24576, 13578), (3, 24576,

100%|██████████| 67078/67078 [2:14:11<00:00,  8.33it/s]  


{'results': {'arc_challenge': {'acc': 0.3873720136518771,
                               'acc_norm': 0.3822525597269625,
                               'acc_norm_stderr': 0.014200454049979277,
                               'acc_stderr': 0.014235872487909872},
             'arc_easy': {'acc': 0.6574074074074074,
                          'acc_norm': 0.5109427609427609,
                          'acc_norm_stderr': 0.010257326131172868,
                          'acc_stderr': 0.009738105469984193},
             'boolq': {'acc': 0.6892966360856269,
                       'acc_stderr': 0.008094100581882606},
             'hellaswag': {'acc': 0.5343557060346544,
                           'acc_norm': 0.6984664409480184,
                           'acc_norm_stderr': 0.004579859084500781,
                           'acc_stderr': 0.004977988452502639},
             'piqa': {'acc': 0.7606093579978237,
                      'acc_norm': 0.7470076169749728,
                      'acc_norm_stderr':

# Replacement experiment

In [25]:
layer_num = 28

# Reference result: plain float32 linear on CPU using the stored q_proj weight.
# NOTE(review): the reference passes no bias while layer_v2 receives q_proj.bias;
# this is only a like-for-like comparison if that bias is None - confirm.
test_input = inputs[f"layers.{layer_num}.self_attn.q_proj.input"][0].float()
test_weight = model.model.layers[layer_num].self_attn.q_proj.weight.cpu().float()
test_ref_output = torch.nn.functional.linear(test_input, test_weight, None)

# Candidate: a QLinearLayerV2 that shares args/weight/bias with the existing
# q_proj module, evaluated on DEV.
layer_v2 = qLinearLayer.QLinearLayerV2()
layer_v2.args = model.model.layers[layer_num].self_attn.q_proj.args
layer_v2.weight = model.model.layers[layer_num].self_attn.q_proj.weight
layer_v2.bias = model.model.layers[layer_num].self_attn.q_proj.bias
layer_v2 = layer_v2.to(DEV)
test_eval_output = layer_v2(test_input.to(DEV))

print(test_ref_output)
print(test_eval_output)

# Count elements whose deviation exceeds the tolerance in EITHER direction.
# Comparing the signed difference would ignore every element where the
# candidate output is larger than the reference.
print(((test_ref_output - test_eval_output.cpu()).abs() > 4e-2).sum())

tensor([[[-0.5220630169,  1.2500656843,  1.9160656929,  ...,
          -0.3587054014, -0.6806795001, -0.5678879023],
         [-0.5549470782,  0.3020634353,  0.2215119302,  ...,
          -0.2485001683, -0.8814989328,  0.7776247263],
         [-0.5584750175,  1.2743335962,  1.9794452190,  ...,
          -0.3425338268, -0.7754001021, -0.6752771139],
         ...,
         [ 0.0430632308,  0.0613280237,  0.3762882352,  ...,
           0.0973920226,  1.6304383278, -2.3274102211],
         [ 0.1496249735, -0.3039065003,  0.1800004691,  ...,
           0.7147076726, -0.3562183976, -1.8467016220],
         [-0.5398339033,  0.1594856083,  0.2489071935,  ...,
          -0.6237655282,  0.5928454399, -2.6226317883]]])
tensor([[[-0.5219924450,  1.2499254942,  1.9160754681,  ...,
          -0.3581647277, -0.6810475588, -0.5687916279],
         [-0.5553250313,  0.3030800223,  0.2204785347,  ...,
          -0.2485207915, -0.8809019923,  0.7772969007],
         [-0.5583836436,  1.2744653225,  1.97816

In [12]:
def _clone_as_v2(src):
    """Return a new QLinearLayerV2 that shares `args`, `weight` and `bias` with `src`."""
    clone = qLinearLayer.QLinearLayerV2()
    clone.args = src.args
    clone.weight = src.weight
    clone.bias = src.bias
    return clone


# Collect replacements keyed by module name; the model itself is not modified here.
changed_layers = {}
for name, m in model.model.named_modules():
    if not isinstance(m, qLinearLayer.QLinearLayer):
        continue
    layer_v2 = _clone_as_v2(m)
    changed_layers[name] = layer_v2

In [13]:
# Install every collected replacement into model.model under its module name.
# NOTE(review): set_nested_attr is presumably a dotted-path setattr - confirm in analyze.py.
for name, layer in changed_layers.items():
    analyze.set_nested_attr(model.model, name, layer)

In [14]:
# Print the module tree to check which layer classes the model now contains.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x QLlamaDecoderLayer(
        (self_attn): QLlamaAttention(
          (q_proj): QLinearLayerV2()
          (k_proj): QLinearLayerV2()
          (v_proj): QLinearLayerV2()
          (o_proj): QLinearLayerV2()
          (rotary_emb): LlamaRotaryEmbedding()
          (act_quant): Quantizer()
          (v_quant): Quantizer()
          (k_quant): Quantizer()
        )
        (mlp): QLlamaMLP(
          (gate_proj): QLinearLayerV2()
          (down_proj): QLinearLayerV2()
          (up_proj): QLinearLayerV2()
          (act_fn): SiLU()
          (act_quant): Quantizer()
        )
        (input_layernorm): QLlamaRMSNorm(
          (originalNorm): LlamaRMSNorm()
          (act_quant): Quantizer()
        )
        (post_attention_layernorm): QLlamaRMSNorm(
          (originalNorm): LlamaRMSNorm()
          (act_quant): Quantizer()
        )
      )
    )
    (norm): 

In [14]:
# Perplexity again, after the linear layers were swapped for QLinearLayerV2.
# NOTE(review): the saved run of this cell printed `nan` for wikitext2.
datasets = ['wikitext2']

for dataset in datasets:
    # get_loaders returns two loaders; only `testloader` is consumed in this cell.
    loader_kwargs = dict(seed=0, model="./llama2-7b", seqlen=model.seqlen)
    dataloader, testloader = get_loaders(dataset, **loader_kwargs)

    print(f"Evaluating {dataset} ...")
    ppl = llama_eval(model, testloader, DEV)
    print(f"targetResult,{dataset},{ppl:.3f}")

Evaluating wikitext2 ...


100%|██████████| 32/32 [1:20:27<00:00, 150.86s/it]


targetResult,wikitext2,nan


In [18]:
# Argument schema, one (flags, keyword-options) pair per argument, registered in
# this order. The resulting `args` namespace is handed to LMClass and read for
# the multi-GPU / lm_eval settings later in this cell.
ARGUMENT_SPECS = [
    # --- positional arguments ---
    (('model',), dict(
        type=str,
        help='LlaMa model to load; pass location of hugginface converted checkpoint.')),
    (('dataset',), dict(
        type=str, choices=['wikitext2', 'ptb', 'c4'],
        help='Where to extract calibration data from.')),
    # --- calibration sampling ---
    (('--seed',), dict(
        type=int, default=0,
        help='Seed for sampling the calibration data.')),
    (('--nsamples',), dict(
        type=int, default=128,
        help='Number of calibration data samples.')),
    # --- quantization method ---
    (('--wbits',), dict(
        type=int, default=16, choices=[2, 3, 4, 8, 16],
        help='#bits to use for quantizing weight; use 16 for evaluating base model.')),
    (('--abits',), dict(
        type=int, default=16, choices=[2, 3, 4, 8, 16],
        help='#bits to use for quantizing activation; use 16 for evaluating base model.')),
    (('--exponential',), dict(
        action='store_true',
        help='Whether to use exponent-only for weight quantization.')),
    (('--a_sym',), dict(
        action='store_true',
        help='Whether to perform symmetric quantization. Default is asymmetric.')),
    (('--w_sym',), dict(
        action='store_true',
        help='Whether to perform symmetric quantization. Default is asymmetric.')),
    (('--static',), dict(
        action='store_true',
        help='Whether to perform static quantization (For activtions). Default is dynamic. (Deprecated in Atom)')),
    # NOTE: the two group-size help texts mention 128, but the declared default is 0.
    (('--weight_group_size',), dict(
        type=int, default=0, choices=[0, 32, 64, 128, 256, 384, 768],
        help='Group size when quantizing weights. Using 128 as default quantization group.')),
    (('--weight_channel_group',), dict(
        type=int, default=1,
        help='Group size of channels that will quantize together. (only for weights now)')),
    (('--act_group_size',), dict(
        type=int, default=0, choices=[0, 64, 128, 256, 384, 768],
        help='Group size when quantizing activations. Using 128 as default quantization group.')),
    (('--reorder',), dict(
        action='store_true',
        help='Whether to keep salient weight unquantized.')),
    (('--act_sort_metric',), dict(
        type=str, default='hessian', choices=['abs_mean', 'hessian'],
        help='The metric used to sort the activations.')),
    (('--keeper',), dict(
        type=int, default=0,
        help='Group size to keep outliers.')),
    (('--keeper_precision',), dict(
        type=int, default=0, choices=[0, 1, 2, 3],
        help='Precision to keep outliers. 0 for FP16; 1 for E5M2; 2 for E4M3; 3 for INT8 Quant.')),
    (('--cache_index',), dict(
        action='store_true',
        help='Whether to use cached reorder index')),
    (('--tiling',), dict(
        type=int, default=0, choices=[0, 16],
        help='Tile-wise quantization granularity (Deprecated in Atom).')),
    (('--kv_cache',), dict(
        action='store_true',
        help='Whether to quant KV_Cache')),
    (('--use_gptq',), dict(
        action='store_true',
        help='Whether to use GPTQ for weight quantization.')),
    (('--percdamp',), dict(
        type=float, default=.01,
        help='Percent of the average Hessian diagonal to use for dampening.')),
    (('--a_clip_ratio',), dict(
        type=float, default=1.0,
        help='Clip ratio for activation quantization. new_max = max * clip_ratio')),
    (('--w_clip_ratio',), dict(
        type=float, default=1.0,
        help='Clip ratio for weight quantization. new_max = max * clip_ratio')),
    (('--kv_clip_ratio',), dict(
        type=float, default=1.0,
        help='Clip ratio for kv cache quantization. new_max = max * clip_ratio')),
    # --- evaluation ---
    (("--eval_ppl",), dict(
        action="store_true",
        help='Whether to evaluate perplexity.')),
    (("--eval_common_sense",), dict(
        action="store_true",
        help='Whether to evaluate zero-shot accuray on commonsense reasoning tasks.')),
    (("--multigpu",), dict(
        action="store_true",
        help="at eval, map model to multiple gpus")),
    (("--lm_eval_num_fewshot",), dict(
        type=int, default=0,
        help="Number of shots in lm evaluation. Default is 0 for zero-shot.")),
    (("--lm_eval_limit",), dict(
        type=int, default=-1,
        help="Limit the number of examples in lm evaluation")),
    # --- persistence / data format ---
    (('--save_dir',), dict(
        type=str, default='./saved',
        help='Path to store the reordering indices and quantized weights.')),
    (('--quant_type',), dict(
        type=str, default='int', choices=['int', 'fp'],
        help='Determine the mapped data format by quant_type + n_bits. e.g. int8, fp4.')),
    # store_true combined with default=True: the value is True with or without the flag.
    (('--save_model',), dict(
        action="store_true", default=True,
        help='Whether to save the quantized model.')),
]

parser = argparse.ArgumentParser()
for flags, options in ARGUMENT_SPECS:
    parser.add_argument(*flags, **options)

# Explicit argument list, so nothing is read from the kernel's own sys.argv.
cli_args = [
    "/root/project/Atom/llama2-7b",
    "wikitext2",
    "--wbits", "4", "--abits", "4", "--a_sym", "--w_sym", "--save_model",
    "--act_group_size", "128", "--weight_group_size", "128", "--weight_channel_group", "2",
    "--reorder", "--act_sort_metric", "hessian", "--cache_index",
    "--a_clip_ratio", "0.9", "--w_clip_ratio", "0.85", "--kv_clip_ratio", "1.0",
    "--keeper", "128", "--keeper_precision", "3", "--kv_cache", "--use_gptq",
    "--eval_common_sense", "--lm_eval_limit", "-1", "--multigpu",
]
args = parser.parse_args(args=cli_args)

# Wrap the (now modified) model for lm_eval, switch to eval mode and disable gradients.
lm = LMClass(args, model)
lm.seqlen = 2048
lm.model.eval()
for param in lm.model.parameters():
    param.requires_grad = False

# Device placement: single device unless --multigpu was requested.
model_path = args.model.lower()
is_llama_like = ("llama" in model_path) or ("mixtral" in model_path)

if not args.multigpu:
    lm._device = DEV
    lm.model = lm.model.to(lm.device)
elif is_llama_like:
    backbone = lm.model.model
    map_layers_to_multi_gpus(backbone.layers)
    input_device = backbone.layers[0].device
    output_device = backbone.layers[-1].device
    # First and last decoder layers must end up on the same device.
    assert input_device == output_device
    lm._device = input_device
    backbone.embed_tokens.to(input_device)
    backbone.norm.to(output_device)
    lm.model.lm_head.to(output_device)
elif "opt" in model_path:
    decoder = lm.model.model.decoder
    map_layers_to_multi_gpus(decoder.layers)
    input_device = decoder.layers[0].device
    output_device = decoder.layers[-1].device
    assert input_device == output_device
    lm._device = input_device
    decoder.embed_tokens.to(input_device)
    decoder.embed_positions.to(input_device)
    decoder.final_layer_norm.to(input_device)
    lm.model.lm_head.to(output_device)

# Run the selected lm_eval task(s).
results = {}
# Alternative task lists that were left commented out in this cell:
#   "piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande"
#   "hellaswag,winogrande"
#   "hellaswag"
tasks_str = "winogrande"
task_names = pattern_match(tasks_str.split(","), lm_tasks.ALL_TASKS)
print(f"Selected Tasks: {task_names}")

task_dict = lm_tasks.get_task_dict(task_names)
# -1 means "no limit" and is passed on as None.
eval_limit = None if args.lm_eval_limit == -1 else args.lm_eval_limit
t_results = lm_evaluator.evaluate(
    lm, task_dict, num_fewshot=args.lm_eval_num_fewshot, limit=eval_limit
)
results.update(t_results)
pprint(results)

# Summary line per task: 'acc_norm' for the listed tasks, plain 'acc' otherwise.
results_dict = results['results']
NORMALIZED_TASKS = ('piqa', 'arc_easy', 'arc_challenge', 'hellaswag')
for task_name in tasks_str.split(','):
    metric = 'acc_norm' if task_name in NORMALIZED_TASKS else 'acc'
    print(f"INFO {task_name} : {results_dict[task_name][metric]*100:.2f}")

vocab size:  32000
map layer 0 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 1 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 2 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 3 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 4 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 5 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 6 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 7 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 8 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576, 13664)]
map layer 9 to gpu 2, [(0, 24576, 14462), (1, 24576, 14086), (2, 24576, 13658), (3, 24576,

 40%|████      | 1018/2534 [5:43:46<8:31:56, 20.26s/it]


KeyboardInterrupt: 

# Test ACIM GEMM Layer